diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index 20a7d30c..2e21c85d 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -120,4 +120,4 @@ repos: # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file + # args: ["mmocr", "tests", "tools"] # these directories will be checked diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b464115e..9f72ae42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -120,4 +120,4 @@ repos: # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file + # args: ["mmocr", "tests", "tools"] # these directories will be checked diff --git a/docs/en/conf.py b/docs/en/conf.py index 150d5ca7..b41f6568 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -231,4 +231,4 @@ def builder_inited_handler(app): def setup(app): - app.connect('builder-inited', builder_inited_handler) \ No newline at end of file + app.connect('builder-inited', builder_inited_handler) diff --git a/docs/zh_cn/advanced_guides/compassbench_intro.md b/docs/zh_cn/advanced_guides/compassbench_intro.md index be827296..e9e82977 100644 --- a/docs/zh_cn/advanced_guides/compassbench_intro.md +++ b/docs/zh_cn/advanced_guides/compassbench_intro.md @@ -1,194 +1,194 @@ -# CompassBench 介绍 - -## CompassBench 2.0 v1.3 版本 - -CompassBench(官方自建榜单)经历了多次更新迭代,从2024年7月起,OpenCompass将会公布自建榜单的评测规则(评测配置文件)和示例数据集文件,以帮助社区更好的了解自建榜单的评测逻辑和方法。 - -### 能力维度 - -2024年8月榜单将会包括以下能力维度: - -| 能力 | 任务介绍 | 评测方式 | 示例数据地址 | -| -------- | -------------------------------------------------------------------------------------- | ------------------- | ------------------------------------------------------------------------------ | -| 语言 | 评测模型在信息抽取、信息抽取、内容总结、对话、创作等多种任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/language | -| 推理 | 评测模型在逻辑推理、常识推理、表格推理等多种日常推理任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/reasoning | -| 知识 | 评测模型在理科、工科、人文社科等多个领域的知识水平 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/knowledge | -| 数学 | 评测模型在数值计算、高中及大学难度的数学问题上的能力 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/math | -| 代码 | 评测模型在代码生成、代码补全、代码注释、代码重构、代码改写、计算机知识综合问答上的能力 | 客观评测 + 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/code | -| 指令跟随 | 评测模型在基于各类语言、推理、知识等任务中,能否准确遵循复杂指令的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/instruct | -| 智能体 | 评估模型在复杂工具调用的能力,以及数据科学、数学等情况下使用代码解释器的能力 | 客观评测 | https://github.com/open-compass/T-Eval https://github.com/open-compass/CIBench | - -### 评测方法 - -- 对于客观评测,将会采用0-shot + CoT的方式评测。 - - OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误 - - 数学、智能体题目类型与给定的示例数据类似,但真实评测数据与开源数据不同 -- 对于主观评测,将会采用基于大模型评价的方式进行评测。 - - 我们对每一道问题均提供评测时的打分指引。 - - - 比较待测模型相对于参考回复的胜率,共设置为五档 - - - `A++`:回答A远胜于回答B。 - - `A+`:回答A略优于回答B。 - - `A=B`:回答A和回答B质量相同。 - - `B+`:回答B略优于回答A。 - - `B++`:回答B远胜于回答A。 -- 主观评测配置文件 - - [示例评测配置](https://github.com/open-compass/opencompass/blob/main/configs/eval_compassbench_v1_3_subjective.py) -- 主观评价提示词 - -``` - -# Instruction - -You are an expert evaluator. Your task is to evaluate the quality of the \ -responses generated by two AI models. -We will provide you with the user query and a pair of AI-generated \ -responses (Response A and Response B). 
-You should first read the user query and the conversation history \ -carefully for analyzing the task, and then evaluate the quality of the \ -responses based on and rules provided below. - -# Conversation between User and AI - -## User Query -<|begin_of_query|> - -{question} - -<|end_of_query|> - -## Response A -<|begin_of_response_A|> - -{prediction} - -<|end_of_response_A|> - -## Response B -<|begin_of_response_B|> - -{prediction2} - -<|end_of_response_B|> - -# Evaluation - -## Checklist - -<|begin_of_checklist|> - -{checklist} - -<|end_of_checklist|> - -Please use this checklist to guide your evaluation, but do not limit your \ -assessment to the checklist. - -## Rules - -You should compare the above two responses based on your analysis of the \ -user queries and the conversation history. -You should first write down your analysis and the checklist that you used \ -for the evaluation, and then provide your assessment according to the \ -checklist. -There are five choices to give your final assessment: ["A++", "A+", \ -"A=B", "B+", "B++"], which correspond to the following meanings: - -- `A++`: Response A is much better than Response B. -- `A+`: Response A is only slightly better than Response B. -- `A=B`: Response A and B are of the same quality. Please use this \ -choice sparingly. -- `B+`: Response B is only slightly better than Response A. -- `B++`: Response B is much better than Response A. - -## Output Format -First, please output your analysis for each model response, and \ -then summarize your assessment to three aspects: "reason A=B", \ -"reason A>B", and "reason B>A", and finally make your choice for \ -the final assessment. - -Please provide your evaluation results in the following json \ -format by filling in the placeholders in []: - -{ - "analysis of A": "[analysis of Response A]", - "analysis of B": "[analysis of Response B]", - "reason of A=B": "[where Response A and B perform equally well]", - "reason of A>B": "[where Response A is better than Response B]", - "reason of B>A": "[where Response B is better than Response A]", - "choice": "[A++ or A+ or A=B or B+ or B++]", -} - - -# 指令 - -您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。 -我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。 -您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。 - -# 用户与AI之间的对话 - -## 用户问题 -<|begin_of_query|> - -{question} - -<|end_of_query|> - -## 回答A -<|begin_of_response_A|> - -{prediction} - -<|end_of_response_A|> - -## 回答B -<|begin_of_response_B|> - -{prediction2} - -<|end_of_response_B|> - -# 评估 - -## 检查清单 - -<|begin_of_checklist|> - -{checklist} - -<|end_of_checklist|> - -请参考此检查清单来评估回答的质量,但不要局限于此检查清单。 - -## 规则 - -您应当基于用户查询,分析比较上述两种回答。 -您应当基于检查清单写下您的分析,然后提供您的评价。 -有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义: - -- `A++`:回答A远胜于回答B。 -- `A+`:回答A略优于回答B。 -- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。 -- `B+`:回答B略优于回答A。 -- `B++`:回答B远胜于回答A。 - -## 输出格式 -首先,请输出您对每个模型回答的分析, -然后总结您的评估到三个方面:"A=B的理由","A优于B的理由",和 "B优于A的理由", -最后做出您对最终评估的选择。 - -请按照以下json格式提供您的评估结果,通过填充[]中的占位符: - -{ - "回答A的分析": "[回答A的分析]", - "回答B的分析": "[回答B的分析]", - "A=B的理由": "[A和B回答差不多的理由]", - "A优于B的理由": "[回答A优于B的理由]", - "B优于A的理由": "[回答B优于A的理由]", - "choice": "[A++ or A+ or A=B or B+ or B++]", -} - - -``` +# CompassBench 介绍 + +## CompassBench 2.0 v1.3 版本 + +CompassBench(官方自建榜单)经历了多次更新迭代,从2024年7月起,OpenCompass将会公布自建榜单的评测规则(评测配置文件)和示例数据集文件,以帮助社区更好的了解自建榜单的评测逻辑和方法。 + +### 能力维度 + +2024年8月榜单将会包括以下能力维度: + +| 能力 | 任务介绍 | 评测方式 | 示例数据地址 | +| -------- | -------------------------------------------------------------------------------------- | ------------------- | 
------------------------------------------------------------------------------ | +| 语言 | 评测模型在信息抽取、信息抽取、内容总结、对话、创作等多种任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/language | +| 推理 | 评测模型在逻辑推理、常识推理、表格推理等多种日常推理任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/reasoning | +| 知识 | 评测模型在理科、工科、人文社科等多个领域的知识水平 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/knowledge | +| 数学 | 评测模型在数值计算、高中及大学难度的数学问题上的能力 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/math | +| 代码 | 评测模型在代码生成、代码补全、代码注释、代码重构、代码改写、计算机知识综合问答上的能力 | 客观评测 + 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/code | +| 指令跟随 | 评测模型在基于各类语言、推理、知识等任务中,能否准确遵循复杂指令的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/instruct | +| 智能体 | 评估模型在复杂工具调用的能力,以及数据科学、数学等情况下使用代码解释器的能力 | 客观评测 | https://github.com/open-compass/T-Eval https://github.com/open-compass/CIBench | + +### 评测方法 + +- 对于客观评测,将会采用0-shot + CoT的方式评测。 + - OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误 + - 数学、智能体题目类型与给定的示例数据类似,但真实评测数据与开源数据不同 +- 对于主观评测,将会采用基于大模型评价的方式进行评测。 + - 我们对每一道问题均提供评测时的打分指引。 + + - 比较待测模型相对于参考回复的胜率,共设置为五档 + + - `A++`:回答A远胜于回答B。 + - `A+`:回答A略优于回答B。 + - `A=B`:回答A和回答B质量相同。 + - `B+`:回答B略优于回答A。 + - `B++`:回答B远胜于回答A。 +- 主观评测配置文件 + - [示例评测配置](https://github.com/open-compass/opencompass/blob/main/configs/eval_compassbench_v1_3_subjective.py) +- 主观评价提示词 + +``` + +# Instruction + +You are an expert evaluator. Your task is to evaluate the quality of the \ +responses generated by two AI models. +We will provide you with the user query and a pair of AI-generated \ +responses (Response A and Response B). +You should first read the user query and the conversation history \ +carefully for analyzing the task, and then evaluate the quality of the \ +responses based on and rules provided below. + +# Conversation between User and AI + +## User Query +<|begin_of_query|> + +{question} + +<|end_of_query|> + +## Response A +<|begin_of_response_A|> + +{prediction} + +<|end_of_response_A|> + +## Response B +<|begin_of_response_B|> + +{prediction2} + +<|end_of_response_B|> + +# Evaluation + +## Checklist + +<|begin_of_checklist|> + +{checklist} + +<|end_of_checklist|> + +Please use this checklist to guide your evaluation, but do not limit your \ +assessment to the checklist. + +## Rules + +You should compare the above two responses based on your analysis of the \ +user queries and the conversation history. +You should first write down your analysis and the checklist that you used \ +for the evaluation, and then provide your assessment according to the \ +checklist. +There are five choices to give your final assessment: ["A++", "A+", \ +"A=B", "B+", "B++"], which correspond to the following meanings: + +- `A++`: Response A is much better than Response B. +- `A+`: Response A is only slightly better than Response B. +- `A=B`: Response A and B are of the same quality. Please use this \ +choice sparingly. +- `B+`: Response B is only slightly better than Response A. +- `B++`: Response B is much better than Response A. + +## Output Format +First, please output your analysis for each model response, and \ +then summarize your assessment to three aspects: "reason A=B", \ +"reason A>B", and "reason B>A", and finally make your choice for \ +the final assessment. 
+ +Please provide your evaluation results in the following json \ +format by filling in the placeholders in []: + +{ + "analysis of A": "[analysis of Response A]", + "analysis of B": "[analysis of Response B]", + "reason of A=B": "[where Response A and B perform equally well]", + "reason of A>B": "[where Response A is better than Response B]", + "reason of B>A": "[where Response B is better than Response A]", + "choice": "[A++ or A+ or A=B or B+ or B++]", +} + + +# 指令 + +您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。 +我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。 +您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。 + +# 用户与AI之间的对话 + +## 用户问题 +<|begin_of_query|> + +{question} + +<|end_of_query|> + +## 回答A +<|begin_of_response_A|> + +{prediction} + +<|end_of_response_A|> + +## 回答B +<|begin_of_response_B|> + +{prediction2} + +<|end_of_response_B|> + +# 评估 + +## 检查清单 + +<|begin_of_checklist|> + +{checklist} + +<|end_of_checklist|> + +请参考此检查清单来评估回答的质量,但不要局限于此检查清单。 + +## 规则 + +您应当基于用户查询,分析比较上述两种回答。 +您应当基于检查清单写下您的分析,然后提供您的评价。 +有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义: + +- `A++`:回答A远胜于回答B。 +- `A+`:回答A略优于回答B。 +- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。 +- `B+`:回答B略优于回答A。 +- `B++`:回答B远胜于回答A。 + +## 输出格式 +首先,请输出您对每个模型回答的分析, +然后总结您的评估到三个方面:"A=B的理由","A优于B的理由",和 "B优于A的理由", +最后做出您对最终评估的选择。 + +请按照以下json格式提供您的评估结果,通过填充[]中的占位符: + +{ + "回答A的分析": "[回答A的分析]", + "回答B的分析": "[回答B的分析]", + "A=B的理由": "[A和B回答差不多的理由]", + "A优于B的理由": "[回答A优于B的理由]", + "B优于A的理由": "[回答B优于A的理由]", + "choice": "[A++ or A+ or A=B or B+ or B++]", +} + + +``` diff --git a/docs/zh_cn/advanced_guides/compassbench_v2_0.md b/docs/zh_cn/advanced_guides/compassbench_v2_0.md index ff8f85fb..a547838f 100644 --- a/docs/zh_cn/advanced_guides/compassbench_v2_0.md +++ b/docs/zh_cn/advanced_guides/compassbench_v2_0.md @@ -1,48 +1,48 @@ -# CompassBench 2.0 介绍 - - -## v1.0介绍 -为支持OpenCompass的年度榜单,本文将提供CompassBench的整体介绍。 - -本次评测将在语言、知识、创作、推理、数学、代码、长文本、智能体能力的多项任务上开展评测,现提供任务介绍和题目示例。 - -- 评测方式采样主观与客观相结合的方式,具体根据各个任务不同进行具体设计。 -- 针对推理、数学、代码、智能体等任务,将会采用Few-shot + CoT的评测方式。 -- 对于填空题,通过在Prompt中提供Few-shot和输出格式约束来协助抽取答案。 -- 对于选择题,针对同一问题,通过变换提问方式,减少随机影响。 -- 对于开放式问题的评测,对同一问题进行多次采样,并采用多维度打分的方式进行评价。 - -> OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误。OpenCompass将会在下一期加入指令跟随能力的评测。 - -| 能力 | 任务 | 介绍 | 题目示例 | -| ---- | ---- | ---- | ---- | -| 语言 | 信息抽取 | 信息抽取是指从文本中提取出特定类型的信息。这类任务通常用于处理结构化数据、知识图谱、问答系统等场景。 | ```"question": "野马队在分区轮以 23–16 击败了匹兹堡钢人队,在比赛的最后三分钟拿下 11 分。然后他们在美式足球联合会 (AFC) 锦标赛上以 20–18 击败了第 49 届超级碗卫冕冠军新英格兰爱国者队,在比赛还剩 17 秒 时拦截了新英格兰队的两分转换传球。尽管曼宁在本赛季的拦截上有问题,但他在两场季后赛中未投任何球。\n野马队在 AFC 锦标赛中打败了谁?"``` | -| 语言 | 意图识别 | 意图识别是对用户输入的文本或语音进行分析,判断其意图或需求。这类任务应用于智能客服、语音助手、聊天机器人等场景。 | ```"question": 
"中国文化的天人合一思想\n中西文化的基本差异之一就是,在人与自然的关系问题上,中国文化比较重视人与自然的和谐统一,而西方文化则强调,人要征服自然、改造自然才能求得自己的生存和发展。中国文化的这种特色,有时通过“天人合一”的命题表述出来。中国古代思想家一般都反对把天与人割裂开来、对立起来,而主张天人协调、天人合一。\n天人合一问题,就其理论实质而言,是关于人与自然的统一问题,或者说是自然界和精神的统一问题。应当承认,中国传统文化中的天人合一思想,内容十分复杂,其中既有正确的观点,也有错误的观点,我们必须实事求是地予以分析。但是,从文化的民族性以及对民族文化的推进作用和深远影响看,我们应当大胆肯定。中国古代思想家关于天人合一的思想,其最基本的涵义,就是充分肯定自然界和精神的统一,关注人类行为与自然界的协调问题。从这个意思上说,天人合一思想的,是非常有价值的。\n恩格斯对自然和精神的统一问题,有过一系列精辟的论述。他说:“我们一天天地学会更加正确地理解自然规律,学会认识我们对于自然界的惯常行程的干涉所引起的比较近或比较远的影响。”他还说:“自然界和精神是统一的。自然界不能是无理性的……而理性是不能和自然界矛盾的。”“思维规律和自然规律,只要它们被正确地认识,必然是互相一致的。”恩格斯的这些论述,深刻地揭示了自然和精神统一问题的丰富内涵。根据恩格斯的这些论述,考察中国古代的天人合一思想,不难看出,这种思想有着深刻的合理性。\n中国古代的天人合一思想,强调人与自然的统一,人的行为与自然的协调,道德理性与自然理性的一致,充分显示了中国古代思想家对于主客体之间、主观能动性和客观规律之间关系的辩证思考。根据这种思想,人不能违背自然规律,不能超越自然界的承受力去改造自然、征服自然、破坏自然,而只能在顺从自然规律的条件下去利用自然、调整自然,使之更符合人类的需要,也使自然界的万物都能生长发展。另一方面,自然界也不是主宰人其社会的神秘力量,而是可以认识、可以为我所用的客观对象。这种思想长期实践的结果,是达到自然界与人的统一,人的精神、行为与外在自然的统一,自我身心平衡与自然环境平衡的统一,以及由于这些统一而达到的天道与人道的统一,从而实现完满和谐的精神追求。中国文化的天人合一思想,对于解决当今世界由于工业化和无限制地征服自然而带来的自然环境被污染、生态平衡遭破坏等问题,具有重要的启迪意义;对于我们今天正在进行的社会主义现代化建设,更有着防患于未然的重大现实意义。\n(选自张岱年等主编的《中国文化概论》,有删改)\n根据原文提供的信息,下列推断不正确的一项是","A": "对人与自然关系的认识,中国古代天人合一思想有优于西方文化的地方。","B": "现代人重视和研究天人合一思想,是基于对现实及发展问题的思考。", "C": "肯定天人合一思想的合理性,并不意味着对其思想内容的全盘接受。", "D": "以天人合一思想为指导,可解决当今世界因工业化带来的各种社会问题。",``` | -| 语言 | 情感分析 | 情感分析是对文本中的情感或情绪进行识别和分析的任务。这类任务可用于情感倾向分析场景。例如,分析社交媒体上的用户评论,了解新闻或事件的倾向。| ```"question": "请问以下评价是正面评价还是负面评价?\n大众点评网的霸王餐,200份华辉拉肠双人试吃,员村一店是已经有经营两年以上的,年前装修过,干净齐整,下单的服务员亲切有礼,可能我是第一个用代码验证的,中间拖了点时间去验证,幸好周日10点左右没有平时的多人。拉肠一如既往的滑,皮蛋瘦肉粥很绵,皮蛋瘦肉超多,肉肠是一底带肉一底斋肠,以前没吃过鸡蛋肠觉得6蚊不太划算,现在发现是有三底肠粉的哦,不太喜欢吃肉的可以试下,很饱肚,鼓油是吃过这么多家肠粉店味道调得最好的。","A": "正面评价", "B": "负面评价"```| -| 语言 | 内容总结 | 内容总结是将一篇较长的文本压缩成一篇简短的概括性摘要。这类任务适用于需要快速了解文档核心内容的情境,如新闻标题、电子邮件摘要 | ```联合国减灾办公室负责人格拉瑟。联合国减灾办公室2016年2月11日联合国减灾办公室今天表示,2015年是有记录以来最热的一个年份,在这一年当中,自然灾害影响了近1亿人口。减灾办公室呼吁各国采取行动,应对气候变化,在最大程度上做出努力,防止和减少灾害的发生。联合国减灾办公室所公布的最新数据显示,在过去一年当中,受到灾害影响最重的国家都在亚洲,它们是中国、印度、菲律宾和印度尼西亚。自然灾害共导致2万2000人死亡,带来的经济损失约合660亿美元。然而,尽管这一数字惊人,但却低于1400亿的10年平均数字。其中的部分原因是各国政府采取了更好的防范措施。数据显示,2015年有5000万人深受旱灾之苦,增幅达40%。联合国减灾办公室负责人格拉瑟表示,2015年是记载中最热的一个年份,成因是气候变化和厄尔尼诺天气现象。他指出,最令人感到不安的一个趋势是2015年有记录的主要干旱增加了一倍。他强调,数据表明,减少温室气体排放和适应气候变化对于减少灾害风险至关重要。```| -| 语言 | 内容评价 | 内容评价是对文本的质量、价值或观点进行判断和评价的任务。这类任务可用于评论筛选、观点挖掘等场景。 | ```"question": "以下是一个问题以及针对该问题的两个答案,哪个答案更好?\n问题:创建一篇1000字的非剽窃新闻文章,关于任天堂将于2月8日星期三播出新的任天堂直面会,承诺将公布即将推出的Switch游戏的新细节。2月的任天堂直面会将在东部时间下午5点/太平洋时间下午2点,在公司的YouTube频道上直播。\n\n任天堂表示,星期三的任天堂直面会将持续“大约”40分钟,并将重点放在即将在2023年上半年推出的Nintendo Switch游戏上。\n\n任天堂宣布的Nintendo Switch游戏阵容包括《星之卡比:梦之泉豪华版》,这是2011年Wii游戏的重制版;《魔兵雅各:樱花与失落的恶魔》,这是一个以童话为灵感的冒险故事,主角是年轻的魔兵雅各;《塞尔达传说:王国之泪》,是《荒野之息》的续作;以及《皮克敏4》,这是一款长期运行的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂的计划中,但日期尚未确定的还有《超级战争大战1+2:重启营》,这是两款Game Boy Advance时代的《超级战争》游戏的重制版。经过长时间的延迟,Switch版的《超级战争》合集据传很快就要推出。当然,任天堂在2023年还有其他一些大计划,包括2月17日在好莱坞环球影城开设的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。格斗游戏确实很酷,但可能会让人望而生畏。你可能会看到铁拳或街霸这样的竞技世界,然后想,“我可能很菜!而且没有正确的装备!而且我不懂这些术语!”但是停下来。你可以进入格斗游戏。对Pat Gill来说,这段旅程真的很有意义。以下是他如何拥抱格斗游戏,以及你也可以。\n答案 1:任天堂粉丝们正急切地期待着最新的任天堂直面发布会,该发布会定于2月8日星期三播出。该发布会将在公司的YouTube频道上以东部时间下午5点/太平洋时间下午2点的直播形式进行,承诺提供有关任天堂Switch即将推出的游戏的令人兴奋的新细节。\n\n据任天堂称,直面发布会将持续大约40分钟,专门关注2023年上半年将为Switch发布的游戏。已经宣布的一些游戏标题包括《星之卡比:梦之泉DX》、《魔兽传说:樱花与失落的恶魔》、《塞尔达传说:王国的泪滴》和《皮克敏4》。\n\n任天堂日程表上最受期待的游戏之一是《超级战争大战1+2:重启训练营》,这是两款经典Game Boy Advance游戏的重制版。尽管尚未正式宣布发行日期,但有传言称游戏将在经过长时间延迟后很快推出。\n\n除了游戏阵容外,任天堂还计划在2023年推出许多其他令人兴奋的项目,包括2月17日在好莱坞环球影城开放的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。\n\n对于格斗游戏爱好者来说,任天堂直面发布会承诺将是一个令人兴奋的活动,将提供任天堂Switch最新和最伟大游戏的一瞥。无论您是经验丰富的玩家还是刚刚开始,直面发布会中肯定会有适合每个人的内容。因此,请务必在2月8日星期三收看,以获取来自任天堂的所有最新消息和更新!\n答案 
2:任天堂宣布将于2月8日星期三下午5点EST/下午2点PST,在公司的YouTube频道上播出新的任天堂Direct演示。演示将持续大约40分钟,并将提供有关2023年上半年即将推出的任天堂Switch游戏的新细节。\n\n公司宣布的任天堂Switch阵容包括《星之卡比:梦之返乡豪华版》,这是2011年Wii游戏的重制版;《魔兽师起源:赛瑞莎与失落的恶魔》,一个以童话为灵感的冒险,主角是年轻的贝约奈塔;《塞尔达传说:王国之泪》,《荒野之息》的续作;以及《皮克敏4》,这是一款长寿的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂还有一些其他大计划,包括在2023年2月17日在好莱坞环球影城开设超级任天堂乐园主题公园,以及于4月7日上映《超级马里奥兄弟电影》。\n\n格斗游戏是一种受欢迎的游戏类型,可能是一种令人望而生畏的爱好。然而,人们是可以享受格斗游戏的,Pat Gill就是如何拥抱这种爱好的一个很好的例子。他从一个初学者开始,发现这是一段有意义的旅程。只要有正确的心态和资源,任何人都可以参与格斗游戏,并享受它们所提供的刺激和竞争。" ``` | -| 语言 | 多语言翻译 | 多语言翻译是将一种语言的文本转换为另一种语言的文本。这类任务适用于跨语言沟通、在线翻译等场景。|```"question": "Translate the following sentence from English to French: \"He [Wales] basically lied to us from the start. First, by acting as if this was for legal reasons. Second, by pretending he was listening to us, right up to his art deletion."```| -| 语言 | 中华传统文化理解 | 中华传统文化涉及对中国古代文学、艺术、哲学、历史等领域的研究 | ``` "question": "王实甫在《西厢记》中写道:“淋漓襟袖啼红泪,比司马青衫更湿”,其中“司马青衫”指的是什么"``` | -| 语言 | 中文语意理解 | 中文语意理解涉及理解文本中的词汇、短语和句子之间的语义关系,包括但不限于近义词、反义词、整体-部分关系、修饰关系等。 |``` "question": "“繁荣”与以下哪个词具有近义关系?", "A": "盛世", "B": "荣誉", "C": "繁花", "D": "昌盛"```| -| 语言 | 多轮对话 | 评价模型能否在多轮对话中保持上下文一致性和连贯性的能力,评估模型是否能够理解并记住对话的上下文信息,记住之前的对话内容。 |```[{'role': 'user','content': '我在做一项关于智能手机市场的研究,需要整理一些数据成 Markdown 表格。数据包括品牌名称、市场份额和热销型号。品牌有苹果、三星和华为。苹果的市场份额是30%,热销型号是iPhone 13;三星市场份额是25%,热销型号是Galaxy S21;华为市场份额是20%,热销型号是Mate 40。请帮我做一个表格。'},{'role': 'user','content': '看起来不错,不过我希望表格中的市场份额列展示为百分比和实际销量。苹果的销量是8000万部,三星是6000万部,华为是5000万部。'}, {'role': 'user', 'content': '很好。现在请把表格的标题中文改成英文,并且各列改成对齐方式:品牌列左对齐,市场份额列居中对齐,热销型号列右对齐。'},{'role': 'user', 'content': '可以,我注意到我们可能需要添加一列来表示这些品牌的总收入,苹果为500亿美元,三星为400亿美元,华为为350亿美元。此外,请按市场销量对行进行排序。'}]```| -| 知识 | 生活常识 | 考察普通社会上智力正常的人皆有或普遍拥有的,大众化的知识 | ```"question": "世界四大文明古国有哪些?```| -| 知识 | 自然科学(理科) | 关于自然现象的具体科学,研究自然界的本质和规律(理科):包括不限于数学,物理学,化学,生物学,天文学等 | ```"question": "群的研究对象是什么?"``` | -| 知识 | 自然科学(工科) | 关于自然现象的具体科学,研究自然界的本质和规律(工科):包括不限于计算机科学,医学,建筑学,材料学,机械学,测量学,气象学,环境学等 | ```"question": "下列关于信息安全的说法,正确的是( )。", "options": ["打开朋友转发的网页链接一定是安全的", "安装了杀毒软件后电脑就不会感染病毒", "数据加密是一种提高信息安全性的有效措施", "手机指纹识别技术能确保手机所有信息的安全"]``` | -| 知识 | 社会科学 | 研究社会现象的具体科学,力求揭示社会的本质和规律,例如经济学,政治学,军事学,社会学,管理学,教育学等。社会科学主要以人类社会的组织与结构、体制与关系、功能与效率、秩序与规范为研究认识之对象,并通过这种知识来为人类社会的有序管理、高效运作提供知识、理论和手段 | ```"question": "为了避免资金供应短缺和倒闭,企业经营者需要做什么?"``` | -| 知识 | 人文科学 | 设设计对人的问题的类型思考与情感体验,围绕着关乎人的心灵世界、关乎人的精神生命主题而展开的种种思想、观念、知识和理论的探索。它以人类自身,特别是人的内心情感世界为研究中心,以人自身的发展和完善作为学术探索的出发点和归宿。包括不限于文学,历史学、哲学、艺术、语言等 | ```"question": "光绪二十四年(1898)五月,维新派代表人物康有为从“中体西用”的角度论述了科举制度改革的必要性。这表明他( )", "options": ["在戊戌变法初期思想趋于保守", "认同洋务派的“中体西用”思想", "在教育改革方面与洋务派观点一致", "所说的“体”和“用”与洋务派不同"]``` | -| 创作 | 内容扩写 | 给定标题或者大纲的基础上,通过增加细节、描述和解释,使内容更加丰富、饱满和具有表现力。这种方法主要用于散文、小说等文学创作,以及学术论文、报告等实用文本 | ```请根据我给出的[外星人入侵、核弹、流亡]这些关键词来撰写一篇[科幻]题材的短篇故事。 \n故事需要拥有[引人入胜]的开头以及[反转]的结局,故事线[跌宕起伏]。\n注意请使用[刘慈欣]的写作风格为我撰写这篇故事。减少赘述,内容中不要有重复或意思相近的段落,大约800字``` | -| 创作 | 内容续写 | 现有文本的基础上,继续编写后面的内容。这种方法主要用于小说、故事等叙事性文本。续写部分通常要保持与原有文本的风格、情节和人物设定相一致,同时要求作者具备较强的想象力和创造力。 | ```题目:《新型能源技术在工业生产中的应用与效益》随着能源需求的不断增长和传统能源的有限性,新型能源技术在工业领域的应用备受瞩目。本文将着重探讨新型能源技术对工业生产的潜在影响,以及其在提高生产效益和减少环境影响方面的作用。请按照以上题目和摘要,完成一篇不少于1000字的论文``` | -| 创作 | 内容改写 | 不改变原文主题和基本结构的前提下,对文本进行一定程度的修改、重组和优化。这种方法主要用于修改学术论文、报告、文章等。内容改写的目的是提高文本的表达能力、逻辑性和可读性,同时避免重复。 | ```请帮我总结一封电子邮件的内容,总结需要包含以下四个部分:\n【重要性】根据内容判断事项是否重要,结果包含重要、不重要\n【紧急性】根据内容判断事项是否紧急,结果包含紧急、不紧急\n【核心内容】使用一句简短的话总结邮件最核心的内容。\n【需要回复内容】请判断邮件中哪些内容需要获得我的回复/确认,以列表形式呈现。\n 接下来,请根据下面邮件的内容,进行摘要:\n亲爱的全体员工:\n为了改善大家的身心健康,增强工作效率,公司特别安排了一场瑜伽兴趣培训,现将培训内容通知如下:\n日期及时间:8月15日(周六)上午9:00至11:00\n地点:公司三楼活动室(面积120平米,可容纳30人参加培训)\n培训内容:\n专业瑜伽教练将为大家进行基础的瑜伽技能和健康知识培训。 
瑜伽是一种低强度有氧运动,适合各年龄层人群。它能够通过姿势练习、呼吸技巧等,改善身体的柔韧性和平衡感,帮助人体各系统更好地运行,有效减压提神。\n本次培训重点讲解:\n1)基本的瑜伽哲学及其健康效果介绍\n2)冥想和呼吸技巧演练\n3)10多个常见的基础瑜伽姿势示范及练习(包括猿人式、波浪式、斜 Supported Headstand 等)\n4)瑜伽练习时需要注意的安全事项\n5)瑜伽适宜穿戴的服装和个人物品\n6)参与培训后如何延续瑜伽运动\n培训具体流程:\n9:00-9:30 瑜伽基本概念介绍\n9:30-10:10 练习冥想、呼吸及基础姿势\n10:10-10:30 小休10分钟\n10:30-11:00 继续练习高难度姿势并解答问题\n如有意参加本次瑜伽兴趣培训,请于8月10日前用邮件或电话方式告知我们,我方将安排培训。\n若您有任何问题或建议,也欢迎与我联系。感谢您的收听与参与。```| -| 推理 | 逻辑推理 | 综合考察模型的几种常见逻辑推理模式:如演绎、归纳和溯因。 | ```"question": "在接下来的文本中,符号 -> 代表着一个简单的数学运算。\n695 - 472 -> 229\n222 - 62 -> 166\n689 - 439 -> ?",```| -| 推理 | 常识推理 | 常识推理是指基于日常生活中积累的知识和经验,对事物进行合理推断和判断的过程。它涉及到对常见事物、现象和规律的理解,通过综合分析得出合理的结论。 | ```"question": "美即好效应,指对一个外表英俊漂亮的人,人们很容易误认为他或她的其他方面也很不错。根据上述定义,下列哪项属于美即好效应?( )", "A": "外表英俊漂亮的人在应聘中更受招聘者的青睐", "B": "小芳认为自己的女儿是幼儿园中最漂亮的孩子", "C": "人们常说女孩因为可爱而美丽并非因为美丽而可爱", "D": "购物网站上有一个漂亮的模特往往会提高产品的销量"``` | -| 数学 | 初等数学 | 初等教育数学能力(小学数学) | ```"question": "小芳手上有40元。她的爸爸又给了她100元。她花了30元买了一条牛仔裤,又花了20元买了一个包。那么小芳还剩下多少钱呢?"```| -| 数学 | 中等数学 | 中等教育数学能力(初中和高中数学) | ```"question": "某地开展建设绿色家园活动,活动期间,计划每天种植相同数量的树木.该活动开始后,实际每天比原计划每天多植树$50$棵,实际植树$400$棵所需时间与原计划植树$300$棵所需时间相同.设实际每天植树$x$棵,则下列方程正确的是( )", "options": ["$\\frac{{400}}{{x-50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x-50}}=\\frac{{400}}{x}$", "$\\frac{{400}}{{x+50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x+50}}=\\frac{{400}}{x}$"]```| -| 数学 | 高等 | 高教育数学能力(大学和研究生数学) | ```"question": "已知有向曲线 $L$ 为球面 $x^2+y^2+z^2=2x$ 与平面 $2x-z-1=0$ 的交线,从 $z$ 轴正向往 $z$ 轴负向看去为逆时针方向,计算曲线积分$\\int_L(6xyz-yz^2)dx+2x^2zdy+xyzdz$.", "options": [ "$\\frac{4\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{5\\sqrt5}$", "$\\frac{4\\pi}{5\\sqrt5}$"]``` | -| 代码 | 代码理解 | 输入为用户的需求文字或者部分代码,考察模型的逻辑推理能力和代码生成能力,考察模型对各类编程语言的掌握程度。内容包括不限于:算法和数据结构能力考察编程语言语法考察跨编程语言转换 | ```"question": "编写一个 Python 函数,用于检查两个数字是否仅在一个位置上不同。"```| -| 代码 | 代码分析 | 考察模型对代码的理解和分析能力,给定一段代码,进行代码意图分析,代码规范检查,错误检查等 | ```"question":"\n\ndef truncate_number(number: float) -> float:\n \"\"\" 给定一个正的浮点数,可以将其分解为整数部分(小于给定数字的最大整数)和小数部分(余数部分总是小于1)。\n\n 返回该数字的小数部分。\n >>> truncate_number(3.5)\n 0.5\n \"\"\"",``` | -| 长文本 | 长文本理解与推理 | 考察模型在不同的长度上下文(2k, 4k, 8k, 16k, 32k)情况下的理解和推理能力 | 略 | -| 智能体 | 任务规划 | 智能体根据用户的需求目标和具备工具条件,进行合理的任务拆解,科学地安排子任务的执行顺序和策略,对任务执行路径进行设计和规划,选择合适的策略。 | 略| -| 智能体 | 工具调用 | 评估模型能否准确的调用合适的API,在调用API时能否正确的传递参数 | 略 | -| 智能体 | 反思能力 | 评估模型在子任务执行失败时,是否具有反思和重新规划任务路径的能力 | 略 | -| 智能体 | 任务执行总结 | 评估模型能否根据子任务的执行结果进行总结分析,完成原始任务目标,正确地按指令输出回复 | 略| -| 智能体 | 多轮交互 | 评估模型在进行多轮复杂工具调用时的能力,在多轮情况下能否准确理解意图 | 略 | +# CompassBench 2.0 介绍 + + +## v1.0介绍 +为支持OpenCompass的年度榜单,本文将提供CompassBench的整体介绍。 + +本次评测将在语言、知识、创作、推理、数学、代码、长文本、智能体能力的多项任务上开展评测,现提供任务介绍和题目示例。 + +- 评测方式采样主观与客观相结合的方式,具体根据各个任务不同进行具体设计。 +- 针对推理、数学、代码、智能体等任务,将会采用Few-shot + CoT的评测方式。 +- 对于填空题,通过在Prompt中提供Few-shot和输出格式约束来协助抽取答案。 +- 对于选择题,针对同一问题,通过变换提问方式,减少随机影响。 +- 对于开放式问题的评测,对同一问题进行多次采样,并采用多维度打分的方式进行评价。 + +> OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误。OpenCompass将会在下一期加入指令跟随能力的评测。 + +| 能力 | 任务 | 介绍 | 题目示例 | +| ---- | ---- | ---- | ---- | +| 语言 | 信息抽取 | 信息抽取是指从文本中提取出特定类型的信息。这类任务通常用于处理结构化数据、知识图谱、问答系统等场景。 | ```"question": "野马队在分区轮以 23–16 击败了匹兹堡钢人队,在比赛的最后三分钟拿下 11 分。然后他们在美式足球联合会 (AFC) 锦标赛上以 20–18 击败了第 49 届超级碗卫冕冠军新英格兰爱国者队,在比赛还剩 17 秒 时拦截了新英格兰队的两分转换传球。尽管曼宁在本赛季的拦截上有问题,但他在两场季后赛中未投任何球。\n野马队在 AFC 锦标赛中打败了谁?"``` | +| 语言 | 意图识别 | 意图识别是对用户输入的文本或语音进行分析,判断其意图或需求。这类任务应用于智能客服、语音助手、聊天机器人等场景。 | ```"question": 
"中国文化的天人合一思想\n中西文化的基本差异之一就是,在人与自然的关系问题上,中国文化比较重视人与自然的和谐统一,而西方文化则强调,人要征服自然、改造自然才能求得自己的生存和发展。中国文化的这种特色,有时通过“天人合一”的命题表述出来。中国古代思想家一般都反对把天与人割裂开来、对立起来,而主张天人协调、天人合一。\n天人合一问题,就其理论实质而言,是关于人与自然的统一问题,或者说是自然界和精神的统一问题。应当承认,中国传统文化中的天人合一思想,内容十分复杂,其中既有正确的观点,也有错误的观点,我们必须实事求是地予以分析。但是,从文化的民族性以及对民族文化的推进作用和深远影响看,我们应当大胆肯定。中国古代思想家关于天人合一的思想,其最基本的涵义,就是充分肯定自然界和精神的统一,关注人类行为与自然界的协调问题。从这个意思上说,天人合一思想的,是非常有价值的。\n恩格斯对自然和精神的统一问题,有过一系列精辟的论述。他说:“我们一天天地学会更加正确地理解自然规律,学会认识我们对于自然界的惯常行程的干涉所引起的比较近或比较远的影响。”他还说:“自然界和精神是统一的。自然界不能是无理性的……而理性是不能和自然界矛盾的。”“思维规律和自然规律,只要它们被正确地认识,必然是互相一致的。”恩格斯的这些论述,深刻地揭示了自然和精神统一问题的丰富内涵。根据恩格斯的这些论述,考察中国古代的天人合一思想,不难看出,这种思想有着深刻的合理性。\n中国古代的天人合一思想,强调人与自然的统一,人的行为与自然的协调,道德理性与自然理性的一致,充分显示了中国古代思想家对于主客体之间、主观能动性和客观规律之间关系的辩证思考。根据这种思想,人不能违背自然规律,不能超越自然界的承受力去改造自然、征服自然、破坏自然,而只能在顺从自然规律的条件下去利用自然、调整自然,使之更符合人类的需要,也使自然界的万物都能生长发展。另一方面,自然界也不是主宰人其社会的神秘力量,而是可以认识、可以为我所用的客观对象。这种思想长期实践的结果,是达到自然界与人的统一,人的精神、行为与外在自然的统一,自我身心平衡与自然环境平衡的统一,以及由于这些统一而达到的天道与人道的统一,从而实现完满和谐的精神追求。中国文化的天人合一思想,对于解决当今世界由于工业化和无限制地征服自然而带来的自然环境被污染、生态平衡遭破坏等问题,具有重要的启迪意义;对于我们今天正在进行的社会主义现代化建设,更有着防患于未然的重大现实意义。\n(选自张岱年等主编的《中国文化概论》,有删改)\n根据原文提供的信息,下列推断不正确的一项是","A": "对人与自然关系的认识,中国古代天人合一思想有优于西方文化的地方。","B": "现代人重视和研究天人合一思想,是基于对现实及发展问题的思考。", "C": "肯定天人合一思想的合理性,并不意味着对其思想内容的全盘接受。", "D": "以天人合一思想为指导,可解决当今世界因工业化带来的各种社会问题。",``` | +| 语言 | 情感分析 | 情感分析是对文本中的情感或情绪进行识别和分析的任务。这类任务可用于情感倾向分析场景。例如,分析社交媒体上的用户评论,了解新闻或事件的倾向。| ```"question": "请问以下评价是正面评价还是负面评价?\n大众点评网的霸王餐,200份华辉拉肠双人试吃,员村一店是已经有经营两年以上的,年前装修过,干净齐整,下单的服务员亲切有礼,可能我是第一个用代码验证的,中间拖了点时间去验证,幸好周日10点左右没有平时的多人。拉肠一如既往的滑,皮蛋瘦肉粥很绵,皮蛋瘦肉超多,肉肠是一底带肉一底斋肠,以前没吃过鸡蛋肠觉得6蚊不太划算,现在发现是有三底肠粉的哦,不太喜欢吃肉的可以试下,很饱肚,鼓油是吃过这么多家肠粉店味道调得最好的。","A": "正面评价", "B": "负面评价"```| +| 语言 | 内容总结 | 内容总结是将一篇较长的文本压缩成一篇简短的概括性摘要。这类任务适用于需要快速了解文档核心内容的情境,如新闻标题、电子邮件摘要 | ```联合国减灾办公室负责人格拉瑟。联合国减灾办公室2016年2月11日联合国减灾办公室今天表示,2015年是有记录以来最热的一个年份,在这一年当中,自然灾害影响了近1亿人口。减灾办公室呼吁各国采取行动,应对气候变化,在最大程度上做出努力,防止和减少灾害的发生。联合国减灾办公室所公布的最新数据显示,在过去一年当中,受到灾害影响最重的国家都在亚洲,它们是中国、印度、菲律宾和印度尼西亚。自然灾害共导致2万2000人死亡,带来的经济损失约合660亿美元。然而,尽管这一数字惊人,但却低于1400亿的10年平均数字。其中的部分原因是各国政府采取了更好的防范措施。数据显示,2015年有5000万人深受旱灾之苦,增幅达40%。联合国减灾办公室负责人格拉瑟表示,2015年是记载中最热的一个年份,成因是气候变化和厄尔尼诺天气现象。他指出,最令人感到不安的一个趋势是2015年有记录的主要干旱增加了一倍。他强调,数据表明,减少温室气体排放和适应气候变化对于减少灾害风险至关重要。```| +| 语言 | 内容评价 | 内容评价是对文本的质量、价值或观点进行判断和评价的任务。这类任务可用于评论筛选、观点挖掘等场景。 | ```"question": "以下是一个问题以及针对该问题的两个答案,哪个答案更好?\n问题:创建一篇1000字的非剽窃新闻文章,关于任天堂将于2月8日星期三播出新的任天堂直面会,承诺将公布即将推出的Switch游戏的新细节。2月的任天堂直面会将在东部时间下午5点/太平洋时间下午2点,在公司的YouTube频道上直播。\n\n任天堂表示,星期三的任天堂直面会将持续“大约”40分钟,并将重点放在即将在2023年上半年推出的Nintendo Switch游戏上。\n\n任天堂宣布的Nintendo Switch游戏阵容包括《星之卡比:梦之泉豪华版》,这是2011年Wii游戏的重制版;《魔兵雅各:樱花与失落的恶魔》,这是一个以童话为灵感的冒险故事,主角是年轻的魔兵雅各;《塞尔达传说:王国之泪》,是《荒野之息》的续作;以及《皮克敏4》,这是一款长期运行的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂的计划中,但日期尚未确定的还有《超级战争大战1+2:重启营》,这是两款Game Boy Advance时代的《超级战争》游戏的重制版。经过长时间的延迟,Switch版的《超级战争》合集据传很快就要推出。当然,任天堂在2023年还有其他一些大计划,包括2月17日在好莱坞环球影城开设的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。格斗游戏确实很酷,但可能会让人望而生畏。你可能会看到铁拳或街霸这样的竞技世界,然后想,“我可能很菜!而且没有正确的装备!而且我不懂这些术语!”但是停下来。你可以进入格斗游戏。对Pat Gill来说,这段旅程真的很有意义。以下是他如何拥抱格斗游戏,以及你也可以。\n答案 1:任天堂粉丝们正急切地期待着最新的任天堂直面发布会,该发布会定于2月8日星期三播出。该发布会将在公司的YouTube频道上以东部时间下午5点/太平洋时间下午2点的直播形式进行,承诺提供有关任天堂Switch即将推出的游戏的令人兴奋的新细节。\n\n据任天堂称,直面发布会将持续大约40分钟,专门关注2023年上半年将为Switch发布的游戏。已经宣布的一些游戏标题包括《星之卡比:梦之泉DX》、《魔兽传说:樱花与失落的恶魔》、《塞尔达传说:王国的泪滴》和《皮克敏4》。\n\n任天堂日程表上最受期待的游戏之一是《超级战争大战1+2:重启训练营》,这是两款经典Game Boy Advance游戏的重制版。尽管尚未正式宣布发行日期,但有传言称游戏将在经过长时间延迟后很快推出。\n\n除了游戏阵容外,任天堂还计划在2023年推出许多其他令人兴奋的项目,包括2月17日在好莱坞环球影城开放的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。\n\n对于格斗游戏爱好者来说,任天堂直面发布会承诺将是一个令人兴奋的活动,将提供任天堂Switch最新和最伟大游戏的一瞥。无论您是经验丰富的玩家还是刚刚开始,直面发布会中肯定会有适合每个人的内容。因此,请务必在2月8日星期三收看,以获取来自任天堂的所有最新消息和更新!\n答案 
2:任天堂宣布将于2月8日星期三下午5点EST/下午2点PST,在公司的YouTube频道上播出新的任天堂Direct演示。演示将持续大约40分钟,并将提供有关2023年上半年即将推出的任天堂Switch游戏的新细节。\n\n公司宣布的任天堂Switch阵容包括《星之卡比:梦之返乡豪华版》,这是2011年Wii游戏的重制版;《魔兽师起源:赛瑞莎与失落的恶魔》,一个以童话为灵感的冒险,主角是年轻的贝约奈塔;《塞尔达传说:王国之泪》,《荒野之息》的续作;以及《皮克敏4》,这是一款长寿的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂还有一些其他大计划,包括在2023年2月17日在好莱坞环球影城开设超级任天堂乐园主题公园,以及于4月7日上映《超级马里奥兄弟电影》。\n\n格斗游戏是一种受欢迎的游戏类型,可能是一种令人望而生畏的爱好。然而,人们是可以享受格斗游戏的,Pat Gill就是如何拥抱这种爱好的一个很好的例子。他从一个初学者开始,发现这是一段有意义的旅程。只要有正确的心态和资源,任何人都可以参与格斗游戏,并享受它们所提供的刺激和竞争。" ``` | +| 语言 | 多语言翻译 | 多语言翻译是将一种语言的文本转换为另一种语言的文本。这类任务适用于跨语言沟通、在线翻译等场景。|```"question": "Translate the following sentence from English to French: \"He [Wales] basically lied to us from the start. First, by acting as if this was for legal reasons. Second, by pretending he was listening to us, right up to his art deletion."```| +| 语言 | 中华传统文化理解 | 中华传统文化涉及对中国古代文学、艺术、哲学、历史等领域的研究 | ``` "question": "王实甫在《西厢记》中写道:“淋漓襟袖啼红泪,比司马青衫更湿”,其中“司马青衫”指的是什么"``` | +| 语言 | 中文语意理解 | 中文语意理解涉及理解文本中的词汇、短语和句子之间的语义关系,包括但不限于近义词、反义词、整体-部分关系、修饰关系等。 |``` "question": "“繁荣”与以下哪个词具有近义关系?", "A": "盛世", "B": "荣誉", "C": "繁花", "D": "昌盛"```| +| 语言 | 多轮对话 | 评价模型能否在多轮对话中保持上下文一致性和连贯性的能力,评估模型是否能够理解并记住对话的上下文信息,记住之前的对话内容。 |```[{'role': 'user','content': '我在做一项关于智能手机市场的研究,需要整理一些数据成 Markdown 表格。数据包括品牌名称、市场份额和热销型号。品牌有苹果、三星和华为。苹果的市场份额是30%,热销型号是iPhone 13;三星市场份额是25%,热销型号是Galaxy S21;华为市场份额是20%,热销型号是Mate 40。请帮我做一个表格。'},{'role': 'user','content': '看起来不错,不过我希望表格中的市场份额列展示为百分比和实际销量。苹果的销量是8000万部,三星是6000万部,华为是5000万部。'}, {'role': 'user', 'content': '很好。现在请把表格的标题中文改成英文,并且各列改成对齐方式:品牌列左对齐,市场份额列居中对齐,热销型号列右对齐。'},{'role': 'user', 'content': '可以,我注意到我们可能需要添加一列来表示这些品牌的总收入,苹果为500亿美元,三星为400亿美元,华为为350亿美元。此外,请按市场销量对行进行排序。'}]```| +| 知识 | 生活常识 | 考察普通社会上智力正常的人皆有或普遍拥有的,大众化的知识 | ```"question": "世界四大文明古国有哪些?```| +| 知识 | 自然科学(理科) | 关于自然现象的具体科学,研究自然界的本质和规律(理科):包括不限于数学,物理学,化学,生物学,天文学等 | ```"question": "群的研究对象是什么?"``` | +| 知识 | 自然科学(工科) | 关于自然现象的具体科学,研究自然界的本质和规律(工科):包括不限于计算机科学,医学,建筑学,材料学,机械学,测量学,气象学,环境学等 | ```"question": "下列关于信息安全的说法,正确的是( )。", "options": ["打开朋友转发的网页链接一定是安全的", "安装了杀毒软件后电脑就不会感染病毒", "数据加密是一种提高信息安全性的有效措施", "手机指纹识别技术能确保手机所有信息的安全"]``` | +| 知识 | 社会科学 | 研究社会现象的具体科学,力求揭示社会的本质和规律,例如经济学,政治学,军事学,社会学,管理学,教育学等。社会科学主要以人类社会的组织与结构、体制与关系、功能与效率、秩序与规范为研究认识之对象,并通过这种知识来为人类社会的有序管理、高效运作提供知识、理论和手段 | ```"question": "为了避免资金供应短缺和倒闭,企业经营者需要做什么?"``` | +| 知识 | 人文科学 | 设设计对人的问题的类型思考与情感体验,围绕着关乎人的心灵世界、关乎人的精神生命主题而展开的种种思想、观念、知识和理论的探索。它以人类自身,特别是人的内心情感世界为研究中心,以人自身的发展和完善作为学术探索的出发点和归宿。包括不限于文学,历史学、哲学、艺术、语言等 | ```"question": "光绪二十四年(1898)五月,维新派代表人物康有为从“中体西用”的角度论述了科举制度改革的必要性。这表明他( )", "options": ["在戊戌变法初期思想趋于保守", "认同洋务派的“中体西用”思想", "在教育改革方面与洋务派观点一致", "所说的“体”和“用”与洋务派不同"]``` | +| 创作 | 内容扩写 | 给定标题或者大纲的基础上,通过增加细节、描述和解释,使内容更加丰富、饱满和具有表现力。这种方法主要用于散文、小说等文学创作,以及学术论文、报告等实用文本 | ```请根据我给出的[外星人入侵、核弹、流亡]这些关键词来撰写一篇[科幻]题材的短篇故事。 \n故事需要拥有[引人入胜]的开头以及[反转]的结局,故事线[跌宕起伏]。\n注意请使用[刘慈欣]的写作风格为我撰写这篇故事。减少赘述,内容中不要有重复或意思相近的段落,大约800字``` | +| 创作 | 内容续写 | 现有文本的基础上,继续编写后面的内容。这种方法主要用于小说、故事等叙事性文本。续写部分通常要保持与原有文本的风格、情节和人物设定相一致,同时要求作者具备较强的想象力和创造力。 | ```题目:《新型能源技术在工业生产中的应用与效益》随着能源需求的不断增长和传统能源的有限性,新型能源技术在工业领域的应用备受瞩目。本文将着重探讨新型能源技术对工业生产的潜在影响,以及其在提高生产效益和减少环境影响方面的作用。请按照以上题目和摘要,完成一篇不少于1000字的论文``` | +| 创作 | 内容改写 | 不改变原文主题和基本结构的前提下,对文本进行一定程度的修改、重组和优化。这种方法主要用于修改学术论文、报告、文章等。内容改写的目的是提高文本的表达能力、逻辑性和可读性,同时避免重复。 | ```请帮我总结一封电子邮件的内容,总结需要包含以下四个部分:\n【重要性】根据内容判断事项是否重要,结果包含重要、不重要\n【紧急性】根据内容判断事项是否紧急,结果包含紧急、不紧急\n【核心内容】使用一句简短的话总结邮件最核心的内容。\n【需要回复内容】请判断邮件中哪些内容需要获得我的回复/确认,以列表形式呈现。\n 接下来,请根据下面邮件的内容,进行摘要:\n亲爱的全体员工:\n为了改善大家的身心健康,增强工作效率,公司特别安排了一场瑜伽兴趣培训,现将培训内容通知如下:\n日期及时间:8月15日(周六)上午9:00至11:00\n地点:公司三楼活动室(面积120平米,可容纳30人参加培训)\n培训内容:\n专业瑜伽教练将为大家进行基础的瑜伽技能和健康知识培训。 
瑜伽是一种低强度有氧运动,适合各年龄层人群。它能够通过姿势练习、呼吸技巧等,改善身体的柔韧性和平衡感,帮助人体各系统更好地运行,有效减压提神。\n本次培训重点讲解:\n1)基本的瑜伽哲学及其健康效果介绍\n2)冥想和呼吸技巧演练\n3)10多个常见的基础瑜伽姿势示范及练习(包括猿人式、波浪式、斜 Supported Headstand 等)\n4)瑜伽练习时需要注意的安全事项\n5)瑜伽适宜穿戴的服装和个人物品\n6)参与培训后如何延续瑜伽运动\n培训具体流程:\n9:00-9:30 瑜伽基本概念介绍\n9:30-10:10 练习冥想、呼吸及基础姿势\n10:10-10:30 小休10分钟\n10:30-11:00 继续练习高难度姿势并解答问题\n如有意参加本次瑜伽兴趣培训,请于8月10日前用邮件或电话方式告知我们,我方将安排培训。\n若您有任何问题或建议,也欢迎与我联系。感谢您的收听与参与。```| +| 推理 | 逻辑推理 | 综合考察模型的几种常见逻辑推理模式:如演绎、归纳和溯因。 | ```"question": "在接下来的文本中,符号 -> 代表着一个简单的数学运算。\n695 - 472 -> 229\n222 - 62 -> 166\n689 - 439 -> ?",```| +| 推理 | 常识推理 | 常识推理是指基于日常生活中积累的知识和经验,对事物进行合理推断和判断的过程。它涉及到对常见事物、现象和规律的理解,通过综合分析得出合理的结论。 | ```"question": "美即好效应,指对一个外表英俊漂亮的人,人们很容易误认为他或她的其他方面也很不错。根据上述定义,下列哪项属于美即好效应?( )", "A": "外表英俊漂亮的人在应聘中更受招聘者的青睐", "B": "小芳认为自己的女儿是幼儿园中最漂亮的孩子", "C": "人们常说女孩因为可爱而美丽并非因为美丽而可爱", "D": "购物网站上有一个漂亮的模特往往会提高产品的销量"``` | +| 数学 | 初等数学 | 初等教育数学能力(小学数学) | ```"question": "小芳手上有40元。她的爸爸又给了她100元。她花了30元买了一条牛仔裤,又花了20元买了一个包。那么小芳还剩下多少钱呢?"```| +| 数学 | 中等数学 | 中等教育数学能力(初中和高中数学) | ```"question": "某地开展建设绿色家园活动,活动期间,计划每天种植相同数量的树木.该活动开始后,实际每天比原计划每天多植树$50$棵,实际植树$400$棵所需时间与原计划植树$300$棵所需时间相同.设实际每天植树$x$棵,则下列方程正确的是( )", "options": ["$\\frac{{400}}{{x-50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x-50}}=\\frac{{400}}{x}$", "$\\frac{{400}}{{x+50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x+50}}=\\frac{{400}}{x}$"]```| +| 数学 | 高等 | 高教育数学能力(大学和研究生数学) | ```"question": "已知有向曲线 $L$ 为球面 $x^2+y^2+z^2=2x$ 与平面 $2x-z-1=0$ 的交线,从 $z$ 轴正向往 $z$ 轴负向看去为逆时针方向,计算曲线积分$\\int_L(6xyz-yz^2)dx+2x^2zdy+xyzdz$.", "options": [ "$\\frac{4\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{5\\sqrt5}$", "$\\frac{4\\pi}{5\\sqrt5}$"]``` | +| 代码 | 代码理解 | 输入为用户的需求文字或者部分代码,考察模型的逻辑推理能力和代码生成能力,考察模型对各类编程语言的掌握程度。内容包括不限于:算法和数据结构能力考察编程语言语法考察跨编程语言转换 | ```"question": "编写一个 Python 函数,用于检查两个数字是否仅在一个位置上不同。"```| +| 代码 | 代码分析 | 考察模型对代码的理解和分析能力,给定一段代码,进行代码意图分析,代码规范检查,错误检查等 | ```"question":"\n\ndef truncate_number(number: float) -> float:\n \"\"\" 给定一个正的浮点数,可以将其分解为整数部分(小于给定数字的最大整数)和小数部分(余数部分总是小于1)。\n\n 返回该数字的小数部分。\n >>> truncate_number(3.5)\n 0.5\n \"\"\"",``` | +| 长文本 | 长文本理解与推理 | 考察模型在不同的长度上下文(2k, 4k, 8k, 16k, 32k)情况下的理解和推理能力 | 略 | +| 智能体 | 任务规划 | 智能体根据用户的需求目标和具备工具条件,进行合理的任务拆解,科学地安排子任务的执行顺序和策略,对任务执行路径进行设计和规划,选择合适的策略。 | 略| +| 智能体 | 工具调用 | 评估模型能否准确的调用合适的API,在调用API时能否正确的传递参数 | 略 | +| 智能体 | 反思能力 | 评估模型在子任务执行失败时,是否具有反思和重新规划任务路径的能力 | 略 | +| 智能体 | 任务执行总结 | 评估模型能否根据子任务的执行结果进行总结分析,完成原始任务目标,正确地按指令输出回复 | 略| +| 智能体 | 多轮交互 | 评估模型在进行多轮复杂工具调用时的能力,在多轮情况下能否准确理解意图 | 略 | diff --git a/examples/eval_cascade_evaluator.py b/examples/eval_cascade_evaluator.py index 1c1b0980..beb3a3c5 100644 --- a/examples/eval_cascade_evaluator.py +++ b/examples/eval_cascade_evaluator.py @@ -124,4 +124,4 @@ datasets = math_datasets models = lmdeploy_qwen2_5_7b_instruct_model -work_dir = 'math_prm800k_500_cascade_evaluator' \ No newline at end of file +work_dir = 'math_prm800k_500_cascade_evaluator' diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md index 3692e542..09be4dae 100644 --- a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/README.md @@ -44,4 +44,4 @@ Here is an example of a simple ARC-AGI task that has three training pairs along | Qwen2.5-72B-Instruct | LLaMA3.1-70B-Instruct | gemma-2-27b-it | | ----- | ----- | ----- | -| 0.09 | 0.06 | 0.05 | \ No newline at end of file +| 0.09 | 0.06 | 0.05 | diff --git 
a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py index 416fd1cb..10d7773b 100644 --- a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, F403 \ No newline at end of file + from .arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py index 000e6312..26b5a371 100644 --- a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_872059.py @@ -53,4 +53,4 @@ arc_prize_public_evaluation_datasets = [ infer_cfg=arc_prize_public_evaluation_infer_cfg, eval_cfg=arc_prize_public_evaluation_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py index 536349e9..b2e87117 100644 --- a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py @@ -53,4 +53,4 @@ arc_prize_public_evaluation_datasets = [ infer_cfg=arc_prize_public_evaluation_infer_cfg, eval_cfg=arc_prize_public_evaluation_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py b/opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_b82a50.py similarity index 100% rename from opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py rename to opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_b82a50.py diff --git a/opencompass/configs/datasets/CHARM/charm_reason_gen_f8fca2.py b/opencompass/configs/datasets/CHARM/charm_reason_gen_75da72.py similarity index 100% rename from opencompass/configs/datasets/CHARM/charm_reason_gen_f8fca2.py rename to opencompass/configs/datasets/CHARM/charm_reason_gen_75da72.py diff --git a/opencompass/configs/datasets/CHARM/charm_reason_ppl_3da4de.py b/opencompass/configs/datasets/CHARM/charm_reason_ppl_47e433.py similarity index 100% rename from opencompass/configs/datasets/CHARM/charm_reason_ppl_3da4de.py rename to opencompass/configs/datasets/CHARM/charm_reason_ppl_47e433.py diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt index c70e110c..9fab5c7a 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_Direct.txt @@ -19,4 +19,4 @@ Q:以下陈述是否包含时代错误,请选择正确选项。一个接受 选项: (A) 是 (B) 否 -A:(B) \ No newline at end of file +A:(B) diff --git 
a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt index 4de255f6..7ab04c27 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_EN-CoT.txt @@ -22,4 +22,4 @@ Q:以下陈述是否包含时代错误,请选择正确选项。一个接受 (A) 是 (B) 否 A:Let's think step by step. -This statement mentions that in 1998, New China was established in 1949, and the five-star red flag was designated as the national flag of China. Therefore, October 1, 1998 is National Day, and it is reasonable for people to celebrate National Day at Tiananmen Square, excluding historical errors. So the answer is (B). \ No newline at end of file +This statement mentions that in 1998, New China was established in 1949, and the five-star red flag was designated as the national flag of China. Therefore, October 1, 1998 is National Day, and it is reasonable for people to celebrate National Day at Tiananmen Square, excluding historical errors. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt index 36948e20..eedb9e89 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_XLT.txt @@ -60,4 +60,4 @@ Step-by-step answer: 1.This statement mentions that in 1998, New China was established in 1949 2.The Five Star Red Flag was designated as the national flag of China in 1949. 3.So October 1st, 1998 is National Day, and it is reasonable for people to celebrate National Day at Tiananmen Square, so the statement does not contain any historical errors. -So the answer is (B). \ No newline at end of file +So the answer is (B). 
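The compassbench_intro.md hunks earlier in this diff keep an LLM-judge prompt with `{question}`, `{prediction}`, `{prediction2}` and `{checklist}` placeholders, and require a JSON verdict whose `"choice"` field is one of `A++`, `A+`, `A=B`, `B+`, `B++`. A minimal sketch of that prompt/verdict contract is shown below; every helper name here is invented for illustration and this is not the code OpenCompass actually ships.

```python
import json
import re

# Hypothetical helpers illustrating the judge prompt/verdict contract described
# in the compassbench_intro.md hunks above; not OpenCompass's real implementation.
VALID_CHOICES = {"A++", "A+", "A=B", "B+", "B++"}


def build_judge_prompt(template, question, prediction, prediction2, checklist):
    """Fill the {question}/{prediction}/{prediction2}/{checklist} slots.

    str.replace is used instead of str.format because the template itself
    contains literal JSON braces.
    """
    return (template.replace('{question}', question)
                    .replace('{prediction}', prediction)
                    .replace('{prediction2}', prediction2)
                    .replace('{checklist}', checklist))


def parse_judge_choice(judge_output):
    """Recover the final "choice" field from the judge's (possibly noisy) reply."""
    match = re.search(r'"choice"\s*:\s*"([^"]+)"', judge_output)
    if match and match.group(1).strip() in VALID_CHOICES:
        return match.group(1).strip()
    try:  # fall back to strict JSON parsing when the reply is clean JSON
        choice = json.loads(judge_output).get('choice', '').strip()
        return choice if choice in VALID_CHOICES else None
    except (json.JSONDecodeError, AttributeError):
        return None


# e.g. parse_judge_choice('{"analysis of A": "...", "choice": "A+"}') returns 'A+'
```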
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt index 4b71c4b3..6887aaa4 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Anachronisms_Judgment_ZH-CoT.txt @@ -22,4 +22,4 @@ Q:以下陈述是否包含时代错误,请选择正确选项。一个接受 (A) 是 (B) 否 A:让我们一步一步来思考。 -这个陈述提到了1998年,新中国是1949年成立的,五星红旗在1949年被确定为中国国旗,因此1998年10月1日是国庆节,人们在天安门庆祝国庆是合理的,因此陈述不包含时代错误。所以答案是(B)。 \ No newline at end of file +这个陈述提到了1998年,新中国是1949年成立的,五星红旗在1949年被确定为中国国旗,因此1998年10月1日是国庆节,人们在天安门庆祝国庆是合理的,因此陈述不包含时代错误。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt index 4a391abe..48a4ba09 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_Direct.txt @@ -22,4 +22,4 @@ Q: 和这些歌曲《夜曲》、《本草纲目》、《听妈妈的话》、 (B)《年少有为》 (C)《浮夸》 (D)《三人游》 -A: (A) \ No newline at end of file +A: (A) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt index 272492bd..ab7e27d0 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_EN-CoT.txt @@ -37,4 +37,4 @@ A:Let's think step by step. "Herbalist's Manual" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Michael Lin. It is included in Jay Chou's 2006 album "Still Fantasy." "Listen to Your Mother" is a song performed by Jay Chou, with lyrics and music by Jay Chou, arrangement by Michael Lin and Hong Jingyao. It is included in Jay Chou's 2006 album "Still Fantasy." "Common Jasmine Orange" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Chung Hsin-min. It is included in Jay Chou's self-titled album "Common Jasmine Orange" released in 2004. -These are all famous pop songs performed by Jay Chou. The only song among the options that seems to have something in common with these songs is "Nunchucks" (performed by Jay Chou, composed by Jay Chou, lyrics by Vincent Fang, arrangement by Chung Hsin-min, included in Jay Chou's 2001 album "Fantasy"). So the answer is (A). \ No newline at end of file +These are all famous pop songs performed by Jay Chou. The only song among the options that seems to have something in common with these songs is "Nunchucks" (performed by Jay Chou, composed by Jay Chou, lyrics by Vincent Fang, arrangement by Chung Hsin-min, included in Jay Chou's 2001 album "Fantasy"). So the answer is (A). 
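The subjective protocol described above compares the candidate model against a reference answer and buckets each comparison into one of five levels (`A++`, `A+`, `A=B`, `B+`, `B++`). The diff does not spell out how the leaderboard converts those verdicts into a score, so the weights in this sketch are purely an assumption: one plausible way to turn a list of verdicts into a win rate when the candidate is always response A.

```python
# Purely illustrative: the weights below are an assumption made for this sketch,
# not the official CompassBench scoring. Verdicts assume the candidate is response A.
LEVEL_SCORE = {
    'A++': 1.0,   # candidate much better than the reference
    'A+': 0.75,   # candidate slightly better
    'A=B': 0.5,   # tie
    'B+': 0.25,   # reference slightly better
    'B++': 0.0,   # reference much better
}


def win_rate(verdicts):
    """Average the per-question scores; unrecognized labels count as losses."""
    if not verdicts:
        return 0.0
    return sum(LEVEL_SCORE.get(v, 0.0) for v in verdicts) / len(verdicts)


print(win_rate(['A++', 'A=B', 'B+', 'A+']))  # 0.625
```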
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt index 0e69a8d0..37678c28 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_XLT.txt @@ -73,4 +73,4 @@ Step-by-step answer: 3."Listen to Your Mother" is a song performed by Jay Chou, with lyrics and music by Jay Chou, arrangement by Michael Lin and Hong Jingyao. It is included in Jay Chou's 2006 album "Still Fantasy." 4."Common Jasmine Orange" is a song performed by Jay Chou, with lyrics by Vincent Fang, music by Jay Chou, and arrangement by Chung Hsin-min. It is included in Jay Chou's self-titled album "Common Jasmine Orange" released in 2004. 5.These are all famous pop songs performed by Jay Chou. The only song among the options that seems to have something in common with these songs is "Nunchucks" (performed by Jay Chou, composed by Jay Chou, lyrics by Vincent Fang, arrangement by Chung Hsin-min, included in Jay Chou's 2001 album "Fantasy"). -So the answer is (A). \ No newline at end of file +So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt index f11dd14a..f157094d 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Movie_and_Music_Recommendation_ZH-CoT.txt @@ -37,4 +37,4 @@ A: 让我们一步一步来思考。 《本草纲目》是周杰伦演唱的一首歌曲,由方文山作词,周杰伦作曲,林迈可编曲,收录在周杰伦2006年发行的专辑《依然范特西》中。 《听妈妈的话》是周杰伦演唱的一首歌曲,由周杰伦作词、作曲,林迈可、洪敬尧编曲,收录在周杰伦2006年发行的专辑《依然范特西》中。 《七里香》是周杰伦演唱的一首歌曲,由方文山作词,周杰伦谱曲,钟兴民编曲,收录在周杰伦2004年发行的同名专辑《七里香》中。 -这些都是由周杰伦演唱的中国著名流行音乐歌曲,在所有选项中,唯一与这些歌曲有相同点的歌曲似乎是《双节棍》(由周杰伦演唱,由周杰伦作曲,方文山作词,钟兴民编曲,收录于周杰伦2001年发行的专辑《范特西》中)。所以答案是(A)。 \ No newline at end of file +这些都是由周杰伦演唱的中国著名流行音乐歌曲,在所有选项中,唯一与这些歌曲有相同点的歌曲似乎是《双节棍》(由周杰伦演唱,由周杰伦作曲,方文山作词,钟兴民编曲,收录于周杰伦2001年发行的专辑《范特西》中)。所以答案是(A)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt index 83aa4179..9ca444cb 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_Direct.txt @@ -22,4 +22,4 @@ Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好 (A) 蕴含 (B) 矛盾 (C) 无关 -A:(B) \ No newline at end of file +A:(B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt index 3ce816cc..54df6c1b 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_EN-CoT.txt @@ -25,4 +25,4 @@ Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好 (B) 矛盾 (C) 无关 A: Let's think step by step. -The first sentence states that Xiaoming saw the sunset on the mountain yesterday, while the second sentence states that it rained yesterday and Xiaoming did not go out. 
There is a contradiction between these two sentences, because if Xiaoming had not gone out, he could not have seen the sunset on the mountain. So, the relationship between these two sentences is contradictory. So the answer is (B). \ No newline at end of file +The first sentence states that Xiaoming saw the sunset on the mountain yesterday, while the second sentence states that it rained yesterday and Xiaoming did not go out. There is a contradiction between these two sentences, because if Xiaoming had not gone out, he could not have seen the sunset on the mountain. So, the relationship between these two sentences is contradictory. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt index ccf8c4de..949c1670 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_XLT.txt @@ -64,4 +64,4 @@ What is the relationship between these two statements? Step-by-step answer: 1.The first sentence states that Xiaoming saw the sunset on the mountain yesterday, while the second sentence states that it rained yesterday and Xiaoming did not go out. 2.There is a contradiction between these two sentences, because if Xiaoming had not gone out, he could not have seen the sunset on the mountain. So, the relationship between these two sentences is contradictory. -So the answer is (B). \ No newline at end of file +So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt index de824780..fdc446c4 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Natural_Language_Inference_ZH-CoT.txt @@ -25,4 +25,4 @@ Q:语句一:昨天,小明在山上看落日,感叹道:"夕阳无限好 (B) 矛盾 (C) 无关 A:让我们一步一步来思考。 -第一句话说小明昨天在山上看到了落日,而第二句话说昨天下雨,小明没有出门。这两句话之间存在矛盾,因为如果小明没有出门,那么他就不可能在山上看到落日。所以,这两句话之间的关系是矛盾的。所以答案是(B)。 \ No newline at end of file +第一句话说小明昨天在山上看到了落日,而第二句话说昨天下雨,小明没有出门。这两句话之间存在矛盾,因为如果小明没有出门,那么他就不可能在山上看到落日。所以,这两句话之间的关系是矛盾的。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt index 5f8d5381..c25ea4ef 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_Direct.txt @@ -20,4 +20,4 @@ Q:“知人论世”作为一种文学批评的原则和方法,最早由战国 (B) “知人论世”作为一种文学评论发沿用至今并显现了强大的生命力 (C) “知人论世”可以帮助后人交结古人和古人成为知音 (D) 了解古人和他所处的时代,有助于理解他的作品 -A: (B) \ No newline at end of file +A: (B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt index b4a2e9ad..7a80185a 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_EN-CoT.txt @@ -22,4 +22,4 @@ Q:“知人论世”作为一种文学批评的原则和方法,最早由战国 (C) “知人论世”可以帮助后人交结古人和古人成为知音 (D) 
了解古人和他所处的时代,有助于理解他的作品 A: Let's think step by step. -From this passage, we cannot see (B) that "understanding people and discussing the world" as a literary criticism has been used to this day and has shown strong vitality. Although "knowing people and discussing the world" was indeed proposed by the philosopher Mencius during the Warring States period as a principle and method of literary criticism, this passage does not mention that "knowing people and discussing the world" is still in use today, or that it has shown strong vitality. Therefore, option (B) is an incorrect understanding. So the answer is (B). \ No newline at end of file +From this passage, we cannot see (B) that "understanding people and discussing the world" as a literary criticism has been used to this day and has shown strong vitality. Although "knowing people and discussing the world" was indeed proposed by the philosopher Mencius during the Warring States period as a principle and method of literary criticism, this passage does not mention that "knowing people and discussing the world" is still in use today, or that it has shown strong vitality. Therefore, option (B) is an incorrect understanding. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt index 34351fcc..5a442dbf 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_XLT.txt @@ -59,4 +59,4 @@ Step-by-step answer: 1.From this passage, we cannot see (B) that "understanding people and discussing the world" as a literary criticism has been used to this day and has shown strong vitality. 2.Although "knowing people and discussing the world" was indeed proposed by the philosopher Mencius during the Warring States period as a principle and method of literary criticism, this passage does not mention that "knowing people and discussing the world" is still in use today, or that it has shown strong vitality. 3.Therefore, option (B) is an incorrect understanding. -So the answer is (B). \ No newline at end of file +So the answer is (B). 
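The CHARM few-shot files in this diff all end their chain of thought with a line such as "所以答案是(B)。" or "So the answer is (B).", and the CompassBench notes earlier in the diff state that objective answers are extracted from a constrained output format, with extraction failures counted as wrong. A hedged sketch of that kind of option-letter extraction follows; `extract_choice` and `score` are invented names and this is not the post-processor OpenCompass actually uses.

```python
import re

# Rough sketch of option-letter extraction for multiple-choice CoT outputs like
# the examples above, in the spirit of "answers that cannot be extracted count
# as wrong". Not OpenCompass's real post-processor.

def extract_choice(prediction, options='ABCD'):
    """Return the last bracketed option letter, e.g. 'B' from '所以答案是(B)。'."""
    letters = re.findall(rf'[((]([{options}])[))]', prediction)
    if letters:
        return letters[-1]  # take the final occurrence, after the chain of thought
    # Fallback: a bare trailing letter such as "Answer: B"
    tail = re.search(rf'\b([{options}])\b[^{options}]*$', prediction)
    return tail.group(1) if tail else None


def score(prediction, gold):
    """Extraction failure simply counts as an incorrect answer."""
    return int(extract_choice(prediction) == gold)


print(score('让我们一步一步来思考。……所以答案是(B)。', 'B'))  # 1
```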
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt index 0bc47b68..12334d80 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Reading_Comprehension_ZH-CoT.txt @@ -23,4 +23,4 @@ Q:“知人论世”作为一种文学批评的原则和方法,最早由战国 (C) “知人论世”可以帮助后人交结古人和古人成为知音 (D) 了解古人和他所处的时代,有助于理解他的作品 A:让我们一步一步来思考。 -从这段话中,我们看不到(B)“知人论世”作为一种文学批评已经沿用至今,并显示出强大的生命力。虽然“知人论世”确实是战国时期哲学家孟子提出的一种文学批评的原则和方法,但这段话并没有提到“知人论世”在今天仍在使用,也没有提到它已经显示出强大的生命力。因此,选项(B)是一种错误的理解。所以答案是(B)。 \ No newline at end of file +从这段话中,我们看不到(B)“知人论世”作为一种文学批评已经沿用至今,并显示出强大的生命力。虽然“知人论世”确实是战国时期哲学家孟子提出的一种文学批评的原则和方法,但这段话并没有提到“知人论世”在今天仍在使用,也没有提到它已经显示出强大的生命力。因此,选项(B)是一种错误的理解。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt index df407ea6..01995cb4 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_Direct.txt @@ -19,4 +19,4 @@ Q:下列制作老式棒棒糖的步骤正确的是?选项: (B) 准备材料、制作糖浆、将糖浆倒入模具、冷却定型 (C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆 (D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具 -A:(B) \ No newline at end of file +A:(B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt index 6494630a..dc8be3ad 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_EN-CoT.txt @@ -22,4 +22,4 @@ Q:下列制作老式棒棒糖的步骤正确的是?选项: (C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆 (D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具 A: Let's think step by step. -The title mentions the steps to make old-fashioned lollipops, and the options include "preparing materials", "pouring syrup into the mold", "making syrup", and "cooling and shaping". According to the steps to make old-fashioned lollipops, the first step should be to prepare the materials, then make syrup, pour syrup into the mold, and finally cool and shape. So the answer is (B). \ No newline at end of file +The title mentions the steps to make old-fashioned lollipops, and the options include "preparing materials", "pouring syrup into the mold", "making syrup", and "cooling and shaping". According to the steps to make old-fashioned lollipops, the first step should be to prepare the materials, then make syrup, pour syrup into the mold, and finally cool and shape. So the answer is (B). 
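The `examples/eval_cascade_evaluator.py` and `arc_prize_public_evaluation_gen.py` hunks earlier in this diff show the two halves of a typical OpenCompass config: mmengine's `read_base()` pulling in dataset and model configs, and module-level `datasets` / `models` / `work_dir` variables driving the run. The sketch below combines them; the import paths are assumptions inferred from the file locations in this diff and may differ across OpenCompass versions.

```python
# Minimal sketch of how the pieces touched in this diff typically fit together
# in an OpenCompass config. The import paths below are assumptions based on the
# file locations shown in this diff, not verified against a specific release.
from mmengine.config import read_base

with read_base():
    # Dataset config touched in this diff (ARC Prize public evaluation set).
    from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen import (
        arc_prize_public_evaluation_datasets,
    )
    # A model config, as referenced in examples/eval_cascade_evaluator.py
    # (hypothetical path).
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )

datasets = arc_prize_public_evaluation_datasets
models = lmdeploy_qwen2_5_7b_instruct_model
work_dir = 'outputs/arc_prize_public_evaluation'  # hypothetical output directory
```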
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt index 11ed9c31..089b23ee 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_XLT.txt @@ -59,4 +59,4 @@ Request: Which of the following steps is correct for making old-fashioned lollip Step-by-step answer: 1.The title mentions the steps to make old-fashioned lollipops, and the options include "preparing materials", "pouring syrup into the mold", "making syrup", and "cooling and shaping". 2.According to the steps to make old-fashioned lollipops, the first step should be to prepare the materials, then make syrup, pour syrup into the mold, and finally cool and shape. -So the answer is (B). \ No newline at end of file +So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt index c30ac670..74a1734a 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sequence_Understanding_ZH-CoT.txt @@ -22,4 +22,4 @@ Q:下列制作老式棒棒糖的步骤正确的是?选项: (C) 准备材料、将糖浆倒入模具、冷却定型、制作糖浆 (D) 准备材料、冷却定型、制作糖浆、将糖浆倒入模具 A:让我们一步一步来思考。 -题目中提到了制作老式棒棒糖的步骤,选项中包含“准备材料”、“将糖浆倒入模具”、“制作糖浆”、“冷却定型”四个步骤,按照制作老式棒棒糖的步骤第一步应该是准备材料,然后需要制作糖浆,再把糖浆倒入模具,最后冷却定型。所以答案是(B)。 \ No newline at end of file +题目中提到了制作老式棒棒糖的步骤,选项中包含“准备材料”、“将糖浆倒入模具”、“制作糖浆”、“冷却定型”四个步骤,按照制作老式棒棒糖的步骤第一步应该是准备材料,然后需要制作糖浆,再把糖浆倒入模具,最后冷却定型。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt index 344eb9f2..bb8f82ba 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_Direct.txt @@ -16,4 +16,4 @@ Q: 下面的句子可信吗? "运动员范智毅射门得分" 选项: (A)可信 (B)不可信 -A: (A) \ No newline at end of file +A: (A) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt index 2f8f3611..8b848cab 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_EN-CoT.txt @@ -19,4 +19,4 @@ Q: 下面的句子可信吗? "运动员范智毅射门得分" (A)可信 (B)不可信 A: Let's think step by step. -Fan Zhiyi is a Chinese football player, and "scoring a goal" is a term used in football. Since he is a professional football player, he is very likely to score a goal. So the answer is (A). \ No newline at end of file +Fan Zhiyi is a Chinese football player, and "scoring a goal" is a term used in football. Since he is a professional football player, he is very likely to score a goal. So the answer is (A). 
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt index 7db7b53f..faee85d4 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Sport_Understanding_ZH-CoT.txt @@ -19,4 +19,4 @@ Q: 下面的句子可信吗? "运动员范智毅射门得分" (A)可信 (B)不可信 A: 让我们一步一步来思考。 -范智毅是一位中国的足球运动员,射门得分是足球运动术语。由于他是一名专业的足球运动员,因此他很可能射门得分,所以答案是(A)。 \ No newline at end of file +范智毅是一位中国的足球运动员,射门得分是足球运动术语。由于他是一名专业的足球运动员,因此他很可能射门得分,所以答案是(A)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt index cf058dc8..0629db6b 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_Direct.txt @@ -22,4 +22,4 @@ Q:小李在亥时三刻开始制作画画,他知道他需要10个小时才 (B) 卯时 (C) 午时 (D) 未时 -A:(A) \ No newline at end of file +A:(A) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt index 9f5efa0d..47bd8426 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_EN-CoT.txt @@ -25,4 +25,4 @@ Q:小李在亥时三刻开始制作画画,他知道他需要10个小时才 (C) 午时 (D) 未时 A:Let's think step by step. -According to the ancient Chinese timing method, the third quarter of the pig hour refers to approximately 21:45 minutes, and 10 hours later it is 7:45 minutes, which is the Dragon Hour . So the answer is (A). \ No newline at end of file +According to the ancient Chinese timing method, the third quarter of the pig hour refers to approximately 21:45 minutes, and 10 hours later it is 7:45 minutes, which is the Dragon Hour . So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt index b3fa5635..ef0c9f49 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_XLT.txt @@ -65,4 +65,4 @@ Option: Step-by-step answer: 1.According to the ancient Chinese timing method, the third quarter of the hour of Hai refers to approximately 21:45, 2.Ten hours later, it will be 7:45, which is Dragon Hour . -So the answer is (A). \ No newline at end of file +So the answer is (A). 
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt index f24019d1..65ebb0b5 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Chinese_Time_Understanding_ZH-CoT.txt @@ -25,4 +25,4 @@ Q:小李在亥时三刻开始制作画画,他知道他需要10个小时才 (C) 午时 (D) 未时 A:让我们一步一步来思考。 -按照中国古代计时方法,亥时三刻大约指的是21点45分,10个小时后是7点45分,是辰时。所以答案是(A)。 \ No newline at end of file +按照中国古代计时方法,亥时三刻大约指的是21点45分,10个小时后是7点45分,是辰时。所以答案是(A)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt index f388d385..ef5c3173 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_Direct.txt @@ -19,4 +19,4 @@ Q:以下陈述是否包含时代错误,一个接受了义务教育、具备 选项: (A) 是 (B) 否 -A:(A) \ No newline at end of file +A:(A) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt index 791bd909..422fed46 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_EN-CoT.txt @@ -22,4 +22,4 @@ Q:以下陈述是否包含时代错误,一个接受了义务教育、具备 (A) 是 (B) 否 A: Let's think step by step. -The statement mentions Sally Hemings, who was an enslaved woman in the United States during the late 18th and early 19th centuries. However, the mention of watching HBO’s True Detective, which is a modern television show, introduces an anachronism. During Sally Hemings’ time, television did not exist, and the specific mention of watching a specific show like True Detective is clearly out of place for that historical period. So the answer is (A). \ No newline at end of file +The statement mentions Sally Hemings, who was an enslaved woman in the United States during the late 18th and early 19th centuries. However, the mention of watching HBO’s True Detective, which is a modern television show, introduces an anachronism. During Sally Hemings’ time, television did not exist, and the specific mention of watching a specific show like True Detective is clearly out of place for that historical period. So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt index bd4e898b..de1911c9 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_XLT.txt @@ -58,4 +58,4 @@ Options: Step-by-step answer: 1.The statement mentions Sally Hemings, who was an enslaved woman in the United States during the late 18th and early 19th centuries. 2.However, the mention of watching HBO’s True Detective, which is a modern television show, introduces an anachronism. During Sally Hemings’ time, television did not exist, and the specific mention of watching a specific show like True Detective is clearly out of place for that historical period. -So the answer is (A). 
\ No newline at end of file +So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt index 2401b0ea..6306cabf 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Anachronisms_Judgment_ZH-CoT.txt @@ -22,4 +22,4 @@ Q:以下陈述是否包含时代错误,一个接受了义务教育、具备 (A) 是 (B) 否 A:让我们一步一步来思考。 -这个陈述提到了萨莉·海明斯,她是 18 世纪末到 19 世纪初美国的一个被奴役的女性。然而,提到她边看 HBO 的《真探》边织毛衣引入了一个时代错误。在萨莉·海明斯所处的时代,电视是不存在的,而且具体提到观看像《真探》这样的特定节目在那个历史时期显然是不合适的。所以答案是(A)。 \ No newline at end of file +这个陈述提到了萨莉·海明斯,她是 18 世纪末到 19 世纪初美国的一个被奴役的女性。然而,提到她边看 HBO 的《真探》边织毛衣引入了一个时代错误。在萨莉·海明斯所处的时代,电视是不存在的,而且具体提到观看像《真探》这样的特定节目在那个历史时期显然是不合适的。所以答案是(A)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt index a38c8e33..e42929ef 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_Direct.txt @@ -22,4 +22,4 @@ Q: 寻找一部与《惊世骇案》、《勇敢的心》、《低俗小说》 (B)《肖申克的救赎》 (C)《蜘蛛侠2》 (D)《出租车》 -A: (B) \ No newline at end of file +A: (B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt index b842e86b..7731865f 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_EN-CoT.txt @@ -37,4 +37,4 @@ A: Let's think step by step. Shawshank Redemption is a 1994 American drama film directed by Frank Delabond and starring Tim Robbins and Morgan Freeman. It is a film about hope and perseverance. Spider Man 2 is a 2004 American superhero film directed by Sam Remy and starring Toby Maguire. It is the second installment of the Spider Man trilogy. "Taxi" is a 2004 American comedy action film directed by Tim Storey, starring Jimmy Flanders and Quentin Latafa. This movie is an American remake of a 1998 French film. -And the titles of "The Amazing Case", "Brave Heart", "Pulp Fiction", and "Schindler's List" are all very profound, plot rich, and have strong human themes in movies. They have all won high praise from audiences and critics for their excellent scripts, brilliant performances, and profound themes. The Shawshank Redemption tells the story of a wrongly accused banker who maintains hope in prison and ultimately escapes. The plot of this movie is deeply ingrained in people's hearts, with a profound portrayal of human nature, and there are many similarities with the movie in the title. So the answer is (B). \ No newline at end of file +And the titles of "The Amazing Case", "Brave Heart", "Pulp Fiction", and "Schindler's List" are all very profound, plot rich, and have strong human themes in movies. They have all won high praise from audiences and critics for their excellent scripts, brilliant performances, and profound themes. The Shawshank Redemption tells the story of a wrongly accused banker who maintains hope in prison and ultimately escapes. 
The plot of this movie is deeply ingrained in people's hearts, with a profound portrayal of human nature, and there are many similarities with the movie in the title. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt index cdcaf0e8..98591536 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_XLT.txt @@ -73,4 +73,4 @@ Step-by-step answer: 3.Spider Man 2 is a 2004 American superhero film directed by Sam Remy and starring Toby Maguire. It is the second installment of the Spider Man trilogy. 4."Taxi" is a 2004 American comedy action film directed by Tim Storey, starring Jimmy Flanders and Quentin Latafa. This movie is an American remake of a 1998 French film. 5.And the titles of "The Amazing Case", "Brave Heart", "Pulp Fiction", and "Schindler's List" are all very profound, plot rich, and have strong human themes in movies. They have all won high praise from audiences and critics for their excellent scripts, brilliant performances, and profound themes. The Shawshank Redemption tells the story of a wrongly accused banker who maintains hope in prison and ultimately escapes. The plot of this movie is deeply ingrained in people's hearts, with a profound portrayal of human nature, and there are many similarities with the movie in the title. -So the answer is (B). \ No newline at end of file +So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt index 70abbad2..69b7f0f0 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Movie_and_Music_Recommendation_ZH-CoT.txt @@ -37,4 +37,4 @@ A: 让我们一步一步来思考。 《肖申克的救赎》是一部1994年的美国剧情电影,由弗兰克·德拉邦特执导,蒂姆·罗宾斯和摩根·弗里曼主演,这是一部讲述希望和毅力的电影。 《蜘蛛侠2》是一部2004年的美国超级英雄电影,由萨姆·雷米执导,托比·马奎尔主演,是《蜘蛛侠》三部曲的第二部。 《出租车》这是一部2004年的美国喜剧动作片,由蒂姆·斯托瑞执导,吉米·福兰和昆汀·拉塔法主演。这部电影是1998年法国电影的美国翻拍版。 -而题目中《惊世骇案》、《勇敢的心》、《低俗小说》和《辛德勒的名单》都是一些非常深刻、情节丰富且具有强烈人性主题的电影。它们都以其出色的剧本、精彩的表演和深刻的主题赢得了观众和评论家的高度赞誉。选项中《肖申克的救赎》讲述了一名被冤枉的银行家如何在监狱中保持希望,并最终逃脱的故事。这部电影的情节深入人心,人性描绘深刻,与题目中的电影有许多相似之处。所以答案是(B)。 \ No newline at end of file +而题目中《惊世骇案》、《勇敢的心》、《低俗小说》和《辛德勒的名单》都是一些非常深刻、情节丰富且具有强烈人性主题的电影。它们都以其出色的剧本、精彩的表演和深刻的主题赢得了观众和评论家的高度赞誉。选项中《肖申克的救赎》讲述了一名被冤枉的银行家如何在监狱中保持希望,并最终逃脱的故事。这部电影的情节深入人心,人性描绘深刻,与题目中的电影有许多相似之处。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt index dcac4a19..5a31562f 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_Direct.txt @@ -22,4 +22,4 @@ Q:语句一:你不确定你已经清楚你站着谁的一面。 (A) 蕴含 (B) 矛盾 (C) 无关 -A:(B) \ No newline at end of file +A:(B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt index 202421bc..85bb622d 
100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_EN-CoT.txt @@ -25,4 +25,4 @@ Q:语句一:你不确定你已经清楚你站着谁的一面。 (B) 矛盾 (C) 无关 A: Let's think step by step. -The first sentence indicates that you are not sure who you support, while the second sentence clearly indicates that your position is obvious, which means you are clear about who you support. Therefore, the content of these two sentences is contradictory to each other. So the answer is (B). \ No newline at end of file +The first sentence indicates that you are not sure who you support, while the second sentence clearly indicates that your position is obvious, which means you are clear about who you support. Therefore, the content of these two sentences is contradictory to each other. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt index 16b45350..ecf27f29 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_XLT.txt @@ -66,4 +66,4 @@ What is the relationship between these two statements? Step-by-step answer: 1.The first sentence indicates that you are not sure who you support, while the second sentence clearly indicates that your position is obvious, which means you are clear about who you support. 2.Therefore, the content of these two sentences is contradictory to each other. -So the answer is (B). \ No newline at end of file +So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt index 4a01f45e..3bd36378 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Natural_Language_Inference_ZH-CoT.txt @@ -25,4 +25,4 @@ Q:语句一:你不确定你已经清楚你站着谁的一面。 (B) 矛盾 (C) 无关 A:让我们一步一步来思考。 -第一句话表示你不确定你支持谁,而第二句话则明确表明你的立场是显而易见的,也就是说你清楚自己支持谁。因此,这两句话的内容是相互矛盾的。所以答案是(B)。 \ No newline at end of file +第一句话表示你不确定你支持谁,而第二句话则明确表明你的立场是显而易见的,也就是说你清楚自己支持谁。因此,这两句话的内容是相互矛盾的。所以答案是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt index 22e01f5c..b334728c 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_Direct.txt @@ -19,4 +19,4 @@ Q:一本书要成为畅销书,必须有可读性或者经过精心的包装. 如 (B) 没经过精心包装的书一定不是畅销书 (C) 有可读性的书一定是畅销书 (D) 大多数人喜欢有可读性的畅销书 -A:(A) \ No newline at end of file +A:(A) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt index 25120028..8ac79775 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_EN-CoT.txt @@ -22,4 +22,4 @@ Q:一本书要成为畅销书,必须有可读性或者经过精心的包装. 
如 (C) 有可读性的书一定是畅销书 (D) 大多数人喜欢有可读性的畅销书 A: Let's think step by step -According to the above conclusion, "To become a bestseller, a book must have readability or be carefully packaged." Therefore, (A) "A book without readability or careful packaging is definitely not a bestseller.". Because if a book is neither readable nor carefully packaged, it does not meet the conditions to become a bestseller. So the answer is (A). \ No newline at end of file +According to the above conclusion, "To become a bestseller, a book must have readability or be carefully packaged." Therefore, (A) "A book without readability or careful packaging is definitely not a bestseller.". Because if a book is neither readable nor carefully packaged, it does not meet the conditions to become a bestseller. So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt index cab3f02a..b99e7415 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_XLT.txt @@ -58,4 +58,4 @@ Request: For a book to be a bestseller,it must be readable or well packaged. If Step-by-step answer: 1.According to the above conclusion, "To become a bestseller, a book must have readability or be carefully packaged." Therefore, (A) "A book without readability or careful packaging is definitely not a bestseller.". 2.Because if a book is neither readable nor carefully packaged, it does not meet the conditions to become a bestseller. -So the answer is (A). \ No newline at end of file +So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt index ef234967..5f830fba 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Reading_Comprehension_ZH-CoT.txt @@ -22,4 +22,4 @@ Q:一本书要成为畅销书,必须有可读性或者经过精心的包装. 
如 (C) 有可读性的书一定是畅销书 (D) 大多数人喜欢有可读性的畅销书 A:让我们一步一步来思考。 -根据上述断定,“一本书要成为畅销书,必须有可读性或者经过精心的包装”,所以 (A) "没有可读性又没有精心包装的书一定不是畅销书" 是一定为真的。因为如果一本书既没有可读性,也没有经过精心的包装,那么它就不满足成为畅销书的条件。所以答案是(A)。 \ No newline at end of file +根据上述断定,“一本书要成为畅销书,必须有可读性或者经过精心的包装”,所以 (A) "没有可读性又没有精心包装的书一定不是畅销书" 是一定为真的。因为如果一本书既没有可读性,也没有经过精心的包装,那么它就不满足成为畅销书的条件。所以答案是(A)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt index 36638b45..c90f4062 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_Direct.txt @@ -19,4 +19,4 @@ Q:以下哪个列表按时间顺序排列正确?选项: (B) 世纪末的魔术师,贝克街的亡灵,通往天国的倒计时,侦探们的镇魂歌 (C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌 (D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时 -A:(C) \ No newline at end of file +A:(C) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt index 9b09ea7f..d8f333eb 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_EN-CoT.txt @@ -22,4 +22,4 @@ Q:以下哪个列表按时间顺序排列正确?选项: (C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌 (D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时 A: Let's think step by step. -The options mention the names of four Conan theater versions, among which the Magician of the End of the Century was released in 1999, the Undead on Baker Street was released in 2002, the Countdown to Heaven was released in 2001, and the Souls of Detectives was released in 2006. Therefore, in chronological order, they are the Magician of the End of the Century, the Countdown to Heaven, the Undead on Baker Street, and the Souls of Detectives. So the answer is (C). \ No newline at end of file +The options mention the names of four Conan theater versions, among which the Magician of the End of the Century was released in 1999, the Undead on Baker Street was released in 2002, the Countdown to Heaven was released in 2001, and the Souls of Detectives was released in 2006. Therefore, in chronological order, they are the Magician of the End of the Century, the Countdown to Heaven, the Undead on Baker Street, and the Souls of Detectives. So the answer is (C). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt index e26e293c..f0d77bbc 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_XLT.txt @@ -57,4 +57,4 @@ Request:Which of the following lists is correct in chronological order? Options: Step-by-step answer: 1.The options mention the names of four Conan theater versions, among which the Magician of the End of the Century was released in 1999, the Undead on Baker Street was released in 2002, the Countdown to Heaven was released in 2001, and the Souls of Detectives was released in 2006. 2.Therefore, in chronological order, they are the Magician of the End of the Century, the Countdown to Heaven, the Undead on Baker Street, and the Souls of Detectives. -So the answer is (C). 
\ No newline at end of file +So the answer is (C). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt index b7f052a0..1bd9c3fb 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sequence_Understanding_ZH-CoT.txt @@ -22,4 +22,4 @@ Q:以下哪个列表按时间顺序排列正确?选项: (C) 世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌 (D) 世纪末的魔术师,贝克街的亡灵,侦探们的镇魂歌,通往天国的倒计时 A:让我们一步一步来思考。 -选项中提到了四个柯南剧场版的名称,其中世纪末的魔术师是1999年上映的,贝克街的亡灵是2002年上映的,通往天国的倒计时是2001年上映的,侦探们的镇魂歌是2006年上映的,因此按时间排序是世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌。所以答案是(C)。 \ No newline at end of file +选项中提到了四个柯南剧场版的名称,其中世纪末的魔术师是1999年上映的,贝克街的亡灵是2002年上映的,通往天国的倒计时是2001年上映的,侦探们的镇魂歌是2006年上映的,因此按时间排序是世纪末的魔术师,通往天国的倒计时,贝克街的亡灵,侦探们的镇魂歌。所以答案是(C)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt index 1169a793..da782d94 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_Direct.txt @@ -16,4 +16,4 @@ Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球" 选项: (A) 可信 (B) 不可信 -A: (B) \ No newline at end of file +A: (B) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt index b76e297b..c5e36740 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_EN-CoT.txt @@ -19,4 +19,4 @@ Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球" (A) 可信 (B) 不可信 A: Let's think step by step. -LeBron James is a well-known American professional basketball player. An eagle is a term in golf, meaning two strokes under par. Therefore, this sentence is not credible. The correct answer should be (B). \ No newline at end of file +LeBron James is a well-known American professional basketball player. An eagle is a term in golf, meaning two strokes under par. Therefore, this sentence is not credible. The correct answer should be (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt index c8b26c29..781eac07 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_XLT.txt @@ -54,4 +54,4 @@ Step-by-step answer: 1.LeBron James is a well-known professional basketball player from the United States. 2.An eagle is a term in golf, meaning two strokes under par. 3.Therefore, this sentence is not credible. -So the answer is (B). \ No newline at end of file +So the answer is (B). 
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt index 451b3e02..6bb7feff 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Sport_Understanding_ZH-CoT.txt @@ -19,4 +19,4 @@ Q: 下面的句子可信吗? "勒布朗詹姆斯打进老鹰球" (A) 可信 (B) 不可信 A: 让我们一步一步来思考。 -勒布朗詹姆斯是一位知名的美国职业篮球运动员,老鹰球是高尔夫球术语,就是比标准杆少了两杆的意思。所以这个句子是不可信的,正确答案应该是(B)。 \ No newline at end of file +勒布朗詹姆斯是一位知名的美国职业篮球运动员,老鹰球是高尔夫球术语,就是比标准杆少了两杆的意思。所以这个句子是不可信的,正确答案应该是(B)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt index e48a7359..b0a972d1 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_Direct.txt @@ -24,4 +24,4 @@ Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚 (D)1960年10月14日 (E)1963年01月03日 (F)1960年12月03日 -A: (E) \ No newline at end of file +A: (E) diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt index 4182c81c..7685af94 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_EN-CoT.txt @@ -27,4 +27,4 @@ Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚 (E)1963年01月03日 (F)1960年12月03日 A: Let's think step by step. -If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963. So the answer is (E). \ No newline at end of file +If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963. So the answer is (E). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt index f7ecad3f..30b7dd1b 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_XLT.txt @@ -68,4 +68,4 @@ Request:Select the correct answer according to the question. Li Jing and Wang We Step-by-step answer: 1.If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. 2.The date tomorrow is Jan 3, 1963. -So the answer is (E). \ No newline at end of file +So the answer is (E). 
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt index e5ff431c..76b0dc09 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples/Global_Time_Understanding_ZH-CoT.txt @@ -27,4 +27,4 @@ Q: 请根据题目选择正确答案。李静和王伟在1958年1月2日结婚 (E)1963年01月03日 (F)1960年12月03日 A: 让我们一步一步来思考。 -如果李静和王伟在1958年1月2日结婚,那么如果今天是他们结婚五周年纪念日,那么今天的日期就是1963年1月2日。明天的日期是1963年1月3日,也就是1963年01月03日。所以答案是(E)。 \ No newline at end of file +如果李静和王伟在1958年1月2日结婚,那么如果今天是他们结婚五周年纪念日,那么今天的日期就是1963年1月2日。明天的日期是1963年1月3日,也就是1963年01月03日。所以答案是(E)。 diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt index 3012dc76..7d225009 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Anachronisms_Judgment_Translate-EN.txt @@ -22,4 +22,4 @@ Options: (A) Yes (B) No A: Let's think step by step. -The statement mentions the year 1998, New China was founded in 1949 and the five-star red flag was established as the national flag of China in 1949, so it is reasonable to expect that October 1, 1998 was a National Day and people celebrated the National Day in Tiananmen Square, so the statement does not contain an anachronism. So the answer is (B). \ No newline at end of file +The statement mentions the year 1998, New China was founded in 1949 and the five-star red flag was established as the national flag of China in 1949, so it is reasonable to expect that October 1, 1998 was a National Day and people celebrated the National Day in Tiananmen Square, so the statement does not contain an anachronism. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt index 92b08fb1..50ef5938 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Movie_and_Music_Recommendation_Translate-EN.txt @@ -37,4 +37,4 @@ Nocturne is a song performed by Jay Chou, written by Fong, composed by Jay Chou, "Materia Medica" is a song by Jay Chou, written by Fong, composed by Jay Chou, and arranged by Michael Lam, and featured on Jay Chou's 2006 album Still Vantage Point. "Listen to Your Mother" is a song by Jay Chou, composed by Jay Chou and arranged by Lin Maike and Hong Jingyao, included in Jay Chou's 2006 album "Still Vantage". Seven Miles is a song performed by Jay Chou, with lyrics by Fong, music by Jay Chou, and arrangement by Chung Hsing-min, included in Jay Chou's 2004 album of the same name, Seven Miles. -These are all famous Chinese pop songs sung by Jay Chou. Among all the choices, the only song that seems to have something in common with these songs is "Nunchaku" (sung by Jay Chou, composed by Jay Chou, lyrics by Fong Wen-shan, arranged by Chung Hsing-min, and featured in Jay Chou's 2001 album "Fantasia"). So the answer is (A).
\ No newline at end of file +These are all famous Chinese pop songs sung by Jay Chou. Among all the choices, the only song that seems to have something in common with these songs is "Nunchaku" (sung by Jay Chou, composed by Jay Chou, lyrics by Fong Wen-shan, arranged by Chung Hsing-min, and featured in Jay Chou's 2001 album "Fantasia"). So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt index a0cc8333..78281bbe 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Natural_Language_Inference_Translate-EN.txt @@ -25,4 +25,4 @@ What is the relationship between these two statements? (B) contradictory (C) Irrelevant A: Let's think step by step. -The first sentence says that Ming saw the sunset on the mountain yesterday, while the second sentence says that it rained yesterday and Ming didn't go out. There is a contradiction between these two sentences because if Ming didn't go out, then he couldn't have seen the sunset on the mountain. So there is a contradiction between these two sentences. So the answer is (B). \ No newline at end of file +The first sentence says that Ming saw the sunset on the mountain yesterday, while the second sentence says that it rained yesterday and Ming didn't go out. There is a contradiction between these two sentences because if Ming didn't go out, then he couldn't have seen the sunset on the mountain. So there is a contradiction between these two sentences. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt index 2cd6aab0..22455cbe 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Reading_Comprehension_Translate-EN.txt @@ -23,4 +23,4 @@ Q:As a principle and method of literary criticism, the concept of "knowing peopl (C) "Knowing people and discussing the world" can help the descendants to make friends with the ancients and the ancients to become their soulmates. (D) Knowing about the ancients and their times helps to understand their works. A: Let us think step by step -From the passage, we fail to see that (B) "Knowing the people of the world" has been used as a kind of literary criticism and has shown a strong vitality. Although it is true that "knowing people and discussing the world" is a principle and method of literary criticism put forward by the philosopher Mencius during the Warring States period, the passage does not mention that "knowing people and discussing the world" is still in use today, nor does it mention that it has shown great vitality. Therefore, option (B) is a misconception. So the answer is (B). \ No newline at end of file +From the passage, we fail to see that (B) "Knowing the people of the world" has been used as a kind of literary criticism and has shown a strong vitality. 
Although it is true that "knowing people and discussing the world" is a principle and method of literary criticism put forward by the philosopher Mencius during the Warring States period, the passage does not mention that "knowing people and discussing the world" is still in use today, nor does it mention that it has shown great vitality. Therefore, option (B) is a misconception. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt index cfa2236a..9c6ce7db 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sequence_Understanding_Translate-EN.txt @@ -22,4 +22,4 @@ Q:Which of the following steps is correct for making old-fashioned lollipops? Op (C) Prepare ingredients, pour syrup into mold, cool and set, make syrup (D) Prepare ingredients, cool and set, make syrup, pour syrup into molds A: Let's think step by step. -The question refers to the steps in making old-fashioned lollipops, and the options include "prepare ingredients", "pour syrup into molds", "make syrup", and "cool and set". According to the steps of making old-fashioned lollipops, the first step is to prepare the ingredients, then you need to make the syrup, then pour the syrup into the molds, and finally cool and set the lollipops. So the answer is (B). \ No newline at end of file +The question refers to the steps in making old-fashioned lollipops, and the options include "prepare ingredients", "pour syrup into molds", "make syrup", and "cool and set". According to the steps of making old-fashioned lollipops, the first step is to prepare the ingredients, then you need to make the syrup, then pour the syrup into the molds, and finally cool and set the lollipops. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt index 830b38cb..ed5d8465 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Sport_Understanding_Translate-EN.txt @@ -19,4 +19,4 @@ Options: (A) Credible (B) Not credible A: Let's think step by step. -Fan Zhiyi is a Chinese soccer player, and scoring a goal is a soccer sport term. Since he is a professional soccer player, he is likely to score a goal, so the answer is (A). \ No newline at end of file +Fan Zhiyi is a Chinese soccer player, and scoring a goal is a soccer sport term. Since he is a professional soccer player, he is likely to score a goal, so the answer is (A).
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt index ad20ce8d..96074b4f 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Chinese_Time_Understanding_Translate-EN.txt @@ -25,4 +25,4 @@ Options: (C) Midday (D) the hour of midnight A: Let's think step by step. -According to the ancient Chinese method of timekeeping, the third quarter of the hour of Hai (the Pig hour) refers to approximately 21:45, and 10 hours later it is 7:45, which is the hour of Chen (the Dragon hour). So the answer is (A). \ No newline at end of file +According to the ancient Chinese method of timekeeping, the third quarter of the hour of Hai (the Pig hour) refers to approximately 21:45, and 10 hours later it is 7:45, which is the hour of Chen (the Dragon hour). So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt index 97148105..aa2ea859 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Anachronisms_Judgment_Translate-EN.txt @@ -22,4 +22,4 @@ Options: (A) Yes (B) No A: Let's think step by step. -This statement mentions Sally Hemings, an enslaved woman in late 18th and early 19th century America. However, the reference to her knitting while watching HBO's True Detective introduces an anachronism. Television did not exist in the time period in which Sally Hemings lived, and the specific reference to watching a particular program like True Detective is clearly inappropriate for that time period in history. So the answer is (A). \ No newline at end of file +This statement mentions Sally Hemings, an enslaved woman in late 18th and early 19th century America. However, the reference to her knitting while watching HBO's True Detective introduces an anachronism. Television did not exist in the time period in which Sally Hemings lived, and the specific reference to watching a particular program like True Detective is clearly inappropriate for that time period in history. So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt index bdf47e5e..1296ef2c 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Natural_Language_Inference_Translate-EN.txt @@ -25,4 +25,4 @@ What is the relationship between these two statements? (B) Contradictory (C) Unrelated A: Let's think step by step. -The first sentence indicates that you are not sure who you support, while the second sentence makes it clear that your position is obvious, i.e. you know exactly who you support. Therefore, the two sentences contradict each other. So the answer is (B).
\ No newline at end of file +The first sentence indicates that you are not sure who you support, while the second sentence makes it clear that your position is obvious, i.e. you know exactly who you support. Therefore, the two sentences contradict each other. So the answer is (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt index 777de423..526ff442 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Reading_Comprehension_Translate-EN.txt @@ -22,4 +22,4 @@ Q:For a book to be a bestseller,it must be readable or well packaged. If the abo (C) A book that is readable must be a bestseller (D) Most people like bestsellers that are readable. A: Let's think step by step. -According to the above assertion, "For a book to be a bestseller, it must be readable or well-packaged", so (A) "A book that is neither readable nor well-packaged must not be a bestseller" must be true. Because if a book is neither readable nor well-packaged, then it does not fulfill the conditions for being a bestseller. So the answer is (A). \ No newline at end of file +According to the above assertion, "For a book to be a bestseller, it must be readable or well-packaged", so (A) "A book that is neither readable nor well-packaged must not be a bestseller" must be true. Because if a book is neither readable nor well-packaged, then it does not fulfill the conditions for being a bestseller. So the answer is (A). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt index b8193c85..85aee8d4 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sequence_Understanding_Translate-EN.txt @@ -22,4 +22,4 @@ Q:Which of the following lists is correct in chronological order? Options: (C) The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, The Detectives' Requiem (D) The Magician at the End of the Century, The Undead of Baker Street, The Detectives' Requiem, The Countdown to Heaven A: Let's think step by step. -The options mention the names of the four Conan theatrical releases. The Magician at the End of the Century was released in 1999, The Undead of Baker Street was released in 2002, Countdown to Heaven was released in 2001, and The Detectives' Requiem was released in 2006, so in chronological order it's The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, and The Detectives' Requiem. So the answer is (C). \ No newline at end of file +The options mention the names of the four Conan theatrical releases. The Magician at the End of the Century was released in 1999, The Undead of Baker Street was released in 2002, Countdown to Heaven was released in 2001, and The Detectives' Requiem was released in 2006, so in chronological order it's The Magician at the End of the Century, Countdown to Heaven, The Undead of Baker Street, and The Detectives' Requiem. So the answer is (C). 
diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt index ef56e501..99d903a8 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Sport_Understanding_Translate-EN.txt @@ -19,4 +19,4 @@ Options: (A) Credible (B) Not credible A: Let's think step by step. -LeBron James is a well-known American professional basketball player. An eagle is a term in golf, meaning two strokes under par. Therefore, this sentence is not credible. The correct answer should be (B). \ No newline at end of file +LeBron James is a well-known American professional basketball player. An eagle is a term in golf, meaning two strokes under par. Therefore, this sentence is not credible. The correct answer should be (B). diff --git a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt index f22f8dd5..a9348295 100644 --- a/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt +++ b/opencompass/configs/datasets/CHARM/few-shot-examples_Translate-EN/Global_Time_Understanding_Translate-EN.txt @@ -27,4 +27,4 @@ Q: Select the correct answer according to the question. Li Jing and Wang Wei wer (E)January 03, 1963 (F) December 03, 1960 A: Let's think step by step. -If Li Jing and Wang Wei were married on January 2, 1958, then today's date would be January 2, 1963 if today is their fifth wedding anniversary. Tomorrow's date will be January 3, 1963, which is January 03, 1963. So the answer is (E). \ No newline at end of file +If Li Jing and Wang Wei were married on January 2, 1958, then today's date would be January 2, 1963 if today is their fifth wedding anniversary. Tomorrow's date will be January 3, 1963, which is January 03, 1963. So the answer is (E).
diff --git a/opencompass/configs/datasets/IFEval/IFEval_gen.py b/opencompass/configs/datasets/IFEval/IFEval_gen.py index 56ed7e03..e81d35c6 100644 --- a/opencompass/configs/datasets/IFEval/IFEval_gen.py +++ b/opencompass/configs/datasets/IFEval/IFEval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, F403 \ No newline at end of file + from .IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/aime2024/README.md b/opencompass/configs/datasets/aime2024/README.md index b75c9dbb..4728f7a9 100644 --- a/opencompass/configs/datasets/aime2024/README.md +++ b/opencompass/configs/datasets/aime2024/README.md @@ -10,4 +10,4 @@ Math dataset composed of problems from AIME2024 (American Invitational Mathemati | Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat | | ----------- | ----------- | ----------- | -| 31.25 | 26.44 | 9.13 | \ No newline at end of file +| 31.25 | 26.44 | 9.13 | diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py index 0a1790e0..c7c5d9e7 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_gen_2b9dc2.py @@ -36,4 +36,4 @@ aime2024_datasets = [ infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py index c4467968..416706c6 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_gen_2b9dc2.py @@ -90,4 +90,4 @@ aime2024_datasets = [ eval_cfg=aime2024_eval_cfg, mode='singlescore', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py index 33d84265..9c119382 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py @@ -92,4 +92,4 @@ aime2024_datasets = [ eval_cfg=aime2024_eval_cfg, mode='singlescore', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py index 87c65f96..81b7106d 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py @@ -84,4 +84,4 @@ aime2024_datasets = [ eval_cfg=aime2024_eval_cfg, mode='singlescore', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen.py b/opencompass/configs/datasets/aime2024/aime2024_gen.py index 8c63ca7e..416ef6e8 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_gen.py +++ b/opencompass/configs/datasets/aime2024/aime2024_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403 \ No newline at 
end of file + from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py b/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py index 03f6ddec..d54e74f5 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py +++ b/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py @@ -37,4 +37,4 @@ aime2024_datasets = [ infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg, ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py b/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py index 305a4ec5..4609776d 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py +++ b/opencompass/configs/datasets/aime2024/aime2024_gen_6e39a4.py @@ -36,4 +36,4 @@ aime2024_datasets = [ infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py b/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py index e1525f94..87e7a824 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py +++ b/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403 \ No newline at end of file + from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py index 070a63bc..9e61a97d 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py +++ b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py @@ -93,4 +93,4 @@ aime2024_datasets = [ mode='singlescore', ) for idx in range(16) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py index 07fa6d1b..90c97eab 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py +++ b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py @@ -93,4 +93,4 @@ aime2024_datasets = [ mode='singlescore', ) for idx in range(8) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/babilong/README.md b/opencompass/configs/datasets/babilong/README.md index 091a9319..721e516b 100644 --- a/opencompass/configs/datasets/babilong/README.md +++ b/opencompass/configs/datasets/babilong/README.md @@ -34,4 +34,4 @@ OpenCompass provides the results of some models on the BABILong dataset. 
The eva eprint={2406.10149}, archivePrefix={arXiv} } -``` \ No newline at end of file +``` diff --git a/opencompass/configs/datasets/bbeh/bbeh_gen.py b/opencompass/configs/datasets/bbeh/bbeh_gen.py index 522ade24..1c08dd87 100644 --- a/opencompass/configs/datasets/bbeh/bbeh_gen.py +++ b/opencompass/configs/datasets/bbeh/bbeh_gen.py @@ -90,4 +90,4 @@ for _name in bbeh_free_form_sets: abbr=_name, reader_cfg=bbeh_reader_cfg, infer_cfg=bbeh_infer_cfg.copy(), - eval_cfg=bbeh_eval_cfg.copy())) \ No newline at end of file + eval_cfg=bbeh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py b/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py index 6fa3d563..1e05e13f 100644 --- a/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py +++ b/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py @@ -123,4 +123,4 @@ for _name in bbeh_multiple_choice_sets + bbeh_free_form_sets: infer_cfg=bbeh_infer_cfg, eval_cfg=bbeh_eval_cfg, ) - ) \ No newline at end of file + ) diff --git a/opencompass/configs/datasets/bbh/bbh_gen.py b/opencompass/configs/datasets/bbh/bbh_gen.py index 240d4457..b0a748f4 100644 --- a/opencompass/configs/datasets/bbh/bbh_gen.py +++ b/opencompass/configs/datasets/bbh/bbh_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403 \ No newline at end of file + from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py b/opencompass/configs/datasets/bbh/bbh_gen_366da4.py similarity index 100% rename from opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py rename to opencompass/configs/datasets/bbh/bbh_gen_366da4.py diff --git a/opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py b/opencompass/configs/datasets/bbh/bbh_gen_51d347.py similarity index 100% rename from opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py rename to opencompass/configs/datasets/bbh/bbh_gen_51d347.py diff --git a/opencompass/configs/datasets/bbh/bbh_gen_98fba6.py b/opencompass/configs/datasets/bbh/bbh_gen_bbe236.py similarity index 100% rename from opencompass/configs/datasets/bbh/bbh_gen_98fba6.py rename to opencompass/configs/datasets/bbh/bbh_gen_bbe236.py diff --git a/opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py b/opencompass/configs/datasets/bbh/bbh_gen_dbf069.py similarity index 100% rename from opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py rename to opencompass/configs/datasets/bbh/bbh_gen_dbf069.py diff --git a/opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py b/opencompass/configs/datasets/bbh/bbh_gen_f9c8ef.py similarity index 100% rename from opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py rename to opencompass/configs/datasets/bbh/bbh_gen_f9c8ef.py diff --git a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py index 1b1c143b..f0e82fc8 100644 --- a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py +++ b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403 \ No newline at end of file + from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/bbh/lib_prompt/boolean_expressions.txt b/opencompass/configs/datasets/bbh/lib_prompt/boolean_expressions.txt index 15d7903c..5a15edec 100755 --- 
a/opencompass/configs/datasets/bbh/lib_prompt/boolean_expressions.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/boolean_expressions.txt @@ -20,4 +20,4 @@ A: Let's think step by step. Remember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is "not", "and", "or", respectively. We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )". Let's evaluate A: A = not ( False ) = not False = True. -Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True. \ No newline at end of file +Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/causal_judgement.txt b/opencompass/configs/datasets/bbh/lib_prompt/causal_judgement.txt index 04394491..fe13ff8a 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/causal_judgement.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/causal_judgement.txt @@ -22,4 +22,4 @@ Options: - Yes - No A: Let's think step by step. -Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes. \ No newline at end of file +Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/date_understanding.txt b/opencompass/configs/datasets/bbh/lib_prompt/date_understanding.txt index a4bb7bbb..5b8ba482 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/date_understanding.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/date_understanding.txt @@ -30,4 +30,4 @@ Options: (E) 01/03/1982 (F) 12/03/1960 A: Let's think step by step. -If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B). \ No newline at end of file +If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/disambiguation_qa.txt b/opencompass/configs/datasets/bbh/lib_prompt/disambiguation_qa.txt index 36f9c867..c7abc433 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/disambiguation_qa.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/disambiguation_qa.txt @@ -34,4 +34,4 @@ A: Let's think step by step. Here we need to determine who the pronoun "his" might be referring to. There are two possible referents for "his", namely Bailey's and the director's. The verb phrase "plan to meet" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. The sentence is then of the form "X will plan to meet Y at (X or Y)'s office." Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office. Now, consider X: "X will plan to meet Y at X's office." 
This case also makes sense, because X might want to meet up with Y at X's own office. -Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C). \ No newline at end of file +Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt b/opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt index 993cfbe0..8f7fa579 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt @@ -69,4 +69,4 @@ We should process each input one by one and keep track of the stack configuratio 29: } ; stack: < [ { Now, we have reached the end. The final stack is "< [ {". We will need to pop out "{", "[", "<" one by one in that order. -So, we need "}", "]", ">". So the answer is } ] >. \ No newline at end of file +So, we need "}", "]", ">". So the answer is } ] >. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/formal_fallacies.txt b/opencompass/configs/datasets/bbh/lib_prompt/formal_fallacies.txt index bf30a67d..4c1fa1f1 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/formal_fallacies.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/formal_fallacies.txt @@ -41,4 +41,4 @@ Let’s see whether the Hypothesis can be deduced from the arguments (1) and (2) By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Nioxin) OR X = loyal-buyer(Caress). We need to consider both cases separately: The case X = rare-consumer(Nioxin) does not appear in (2). The case X = loyal-buyer(Caress) does not appear in (2), either. -So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid. \ No newline at end of file +So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/geometric_shapes.txt b/opencompass/configs/datasets/bbh/lib_prompt/geometric_shapes.txt index 3b7a95ba..34202292 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/geometric_shapes.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/geometric_shapes.txt @@ -75,4 +75,4 @@ Length of side A: |A| = sqrt((41.00-37.00)^2 + (43.00-34.00)^2) = sqrt((4)^2 + ( Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 + (1)^2) = sqrt(16 + 1) = sqrt(17). Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17). Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97). -Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D). \ No newline at end of file +Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt b/opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt index edcddf90..eaa29794 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt @@ -25,4 +25,4 @@ Options: A: Let's think step by step. 
When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun". Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering. -Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B). \ No newline at end of file +Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_five_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_five_objects.txt index 2e64ce9e..9dec2016 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_five_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_five_objects.txt @@ -34,4 +34,4 @@ A: Let's think step by step. (2) The red book is the second from the left: "(left) ? white red gray ? (right)". (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)". According to this ordering, the leftmost book is the white book. -The white book is the leftmost. So the answer is (C). \ No newline at end of file +The white book is the leftmost. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_seven_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_seven_objects.txt index 2e64ce9e..9dec2016 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_seven_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_seven_objects.txt @@ -34,4 +34,4 @@ A: Let's think step by step. (2) The red book is the second from the left: "(left) ? white red gray ? (right)". (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)". According to this ordering, the leftmost book is the white book. -The white book is the leftmost. So the answer is (C). \ No newline at end of file +The white book is the leftmost. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_three_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_three_objects.txt index 2e64ce9e..9dec2016 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_three_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/logical_deduction_three_objects.txt @@ -34,4 +34,4 @@ A: Let's think step by step. (2) The red book is the second from the left: "(left) ? white red gray ? (right)". (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)". According to this ordering, the leftmost book is the white book. -The white book is the leftmost. So the answer is (C). 
\ No newline at end of file +The white book is the leftmost. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/movie_recommendation.txt b/opencompass/configs/datasets/bbh/lib_prompt/movie_recommendation.txt index 5cd3c886..9bf37882 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/movie_recommendation.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/movie_recommendation.txt @@ -39,4 +39,4 @@ A: Let's think step by step. - Total Recall (action, adventure, science-fiction; 2012) - Inside Out (animation, family, comedy; 2015) - Forrest Gump (comedy, drama, romance; 1994) -These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D). \ No newline at end of file +These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt b/opencompass/configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt index 46edfe9a..752029bf 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt @@ -22,4 +22,4 @@ Let’s recall that the order of operations in mathematics is as follows: (1) Pa This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9 - 8 * -7 + -9). Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163. Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56. -Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219. \ No newline at end of file +Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/navigate.txt b/opencompass/configs/datasets/bbh/lib_prompt/navigate.txt index ee15776e..22ed7d3f 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/navigate.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/navigate.txt @@ -40,4 +40,4 @@ We start at the origin (0, 0), facing the positive y-axis. (2) Take 2 steps right: (0, 2), facing the positive y-axis. (3) Take 9 steps left: (0, -7), facing the positive y-axis. (4) Take 7 steps right: (0, 7), facing the positive y-axis. -Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes. \ No newline at end of file +Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt b/opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt index 14258467..00ee8104 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt @@ -34,4 +34,4 @@ We first identify the vegetables on the list and include their quantity in paren - carrots (2) - garlic (1) - yam (1) -Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8. 
\ No newline at end of file +Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt b/opencompass/configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt index 97df44aa..daca64ba 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt @@ -38,4 +38,4 @@ A: Let's think step by step. This question focuses on the name. We know the following: The names of the penguin in the table are Louis, Bernard, Vincent, and Gwen. When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent. The name of the second penguin sorted by alphabetical order is Gwen. -The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D). \ No newline at end of file +The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/reasoning_about_colored_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/reasoning_about_colored_objects.txt index b492f201..6b1ce188 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/reasoning_about_colored_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/reasoning_about_colored_objects.txt @@ -60,4 +60,4 @@ Options: A: Let's think step by step. According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup. The teal plate is the first item, namely (1). There is no item to the left of the teal item. -The number of non-orange items to the left of the teal item is zero. So the answer is (A). \ No newline at end of file +The number of non-orange items to the left of the teal item is zero. So the answer is (A). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt b/opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt index 65043e81..f144f579 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt @@ -41,4 +41,4 @@ The original name is "counting crows". This is the name of an American rock band (B) "counting cows": Here the word "crows" is changed to "cows", and this is indeed a playful and meaningful edit that ruins the original name of the band. (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows". (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word. -Above the above, the only humorous edit is (B). So the answer is (B). \ No newline at end of file +Above the above, the only humorous edit is (B). So the answer is (B). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/salient_translation_error_detection.txt b/opencompass/configs/datasets/bbh/lib_prompt/salient_translation_error_detection.txt index 07984f2a..b9fd34e7 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/salient_translation_error_detection.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/salient_translation_error_detection.txt @@ -37,4 +37,4 @@ Options: (E) Dropped Content (F) Facts A: Let's think step by step. 
-We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C). \ No newline at end of file +We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/snarks.txt b/opencompass/configs/datasets/bbh/lib_prompt/snarks.txt index 46a9ffd9..5aa46de1 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/snarks.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/snarks.txt @@ -27,4 +27,4 @@ Options: A: Let's think step by step. If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical. If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything. -Above the above, the sarcastic option is (A). So the answer is (A). \ No newline at end of file +Above the above, the sarcastic option is (A). So the answer is (A). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/sports_understanding.txt b/opencompass/configs/datasets/bbh/lib_prompt/sports_understanding.txt index 1f4b89c3..2114db82 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/sports_understanding.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/sports_understanding.txt @@ -7,4 +7,4 @@ Q: Is the following sentence plausible? "Santi Cazorla scored a touchdown." A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part of American football and rugby. So the answer is no. Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend." -A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. 
So the answer is yes. \ No newline at end of file +A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/temporal_sequences.txt b/opencompass/configs/datasets/bbh/lib_prompt/temporal_sequences.txt index 6ccb32bc..09ff5341 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/temporal_sequences.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/temporal_sequences.txt @@ -74,4 +74,4 @@ Wake-up time: 5am. 2pm-3pm: playing tennis at the tennis court. 3pm-4pm: free. The beach closure time: 4pm. -The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D). \ No newline at end of file +The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_five_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_five_objects.txt index ddfefa1b..6c73ec40 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_five_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_five_objects.txt @@ -37,4 +37,4 @@ A: Let's think step by step. (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick. (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola. (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola. -At the end of the dance, Alice is dancing with Patrick. So the answer is (C). \ No newline at end of file +At the end of the dance, Alice is dancing with Patrick. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_seven_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_seven_objects.txt index ddfefa1b..6c73ec40 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_seven_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_seven_objects.txt @@ -37,4 +37,4 @@ A: Let's think step by step. (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick. (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola. (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola. -At the end of the dance, Alice is dancing with Patrick. So the answer is (C). \ No newline at end of file +At the end of the dance, Alice is dancing with Patrick. So the answer is (C). diff --git a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_three_objects.txt b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_three_objects.txt index ddfefa1b..6c73ec40 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_three_objects.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/tracking_shuffled_objects_three_objects.txt @@ -37,4 +37,4 @@ A: Let's think step by step. (1) Alice and Bob switch partners: Alice: Rodrigo, Bob: Lola, Claire: Patrick. (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola. (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola. -At the end of the dance, Alice is dancing with Patrick. So the answer is (C). \ No newline at end of file +At the end of the dance, Alice is dancing with Patrick. So the answer is (C). 
diff --git a/opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt b/opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt index 7569a978..86cc01b9 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt @@ -25,4 +25,4 @@ A: Let's think step by step. (3) Raymond says Michaela tells the truth. Since we know from (2) that Michaela lies, if Raymond says Michaela tells the truth, then Raymond lies. (4) Osvaldo says Raymond tells the truth. Since we know from (3) that Raymond lies, if Osvaldo says Raymond tells the truth, then Osvaldo lies. (5) Jamey says Osvaldo tells the truth. Since we know from (4) that Osvaldo lies, if Jamey says Osvaldo tells the truth, then Jamey lies. -Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No. \ No newline at end of file +Now, the question asks: Does Jamey tell the truth? We know from (5) that Jamey lies. So the answer is No. diff --git a/opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt b/opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt index 45d9bc9f..c20b8b18 100755 --- a/opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt +++ b/opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt @@ -14,4 +14,4 @@ The second letter: "ponderosa": "o" (15). "phone": "h" (8). We now have: (8) "ph Q: Sort the following words alphabetically: List: newt arson parthia seismography mugho aspect census A: Let's think step by step. The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters. -The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography. \ No newline at end of file +The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography. 
diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py index d211b2b6..9550faa7 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py @@ -4,4 +4,4 @@ with read_base(): from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets -bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) \ No newline at end of file +bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) diff --git a/opencompass/configs/datasets/cmmlu/cmmlu_gen.py b/opencompass/configs/datasets/cmmlu/cmmlu_gen.py index f8b559cd..ce059377 100644 --- a/opencompass/configs/datasets/cmmlu/cmmlu_gen.py +++ b/opencompass/configs/datasets/cmmlu/cmmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, F403 \ No newline at end of file + from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py b/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py index d5ca44de..d547462e 100644 --- a/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py +++ b/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets # noqa: F401, F403 \ No newline at end of file + from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/cmo_fib/README.md b/opencompass/configs/datasets/cmo_fib/README.md index 9f397f8b..7a3f079c 100644 --- a/opencompass/configs/datasets/cmo_fib/README.md +++ b/opencompass/configs/datasets/cmo_fib/README.md @@ -10,4 +10,4 @@ Math dataset composed of problems from CMO (Chinese Mathematical Olympiad) 2009- | Qwen2.5-72B-Instruct | Qwen2.5-7B-Instruct | internlm2_5-7b-chat | | ----------- | ----------- | ----------- | -| 20.00 | 16.67 | 6.67 | \ No newline at end of file +| 20.00 | 16.67 | 6.67 | diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py index 39b08adf..3961a440 100644 --- a/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_0shot_notcot_gen_4c6c29.py @@ -36,4 +36,4 @@ cmo_fib_datasets = [ infer_cfg=cmo_fib_infer_cfg, eval_cfg=cmo_fib_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py index aa12cd51..6f5e56ab 100644 --- a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmo_fib_gen_ace24b import cmo_fib_datasets # noqa: F401, F403 \ No newline at end of file + from .cmo_fib_gen_ace24b import cmo_fib_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py index 6fc1147c..e84248cd 100644 --- a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py +++ 
b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py @@ -36,4 +36,4 @@ cmo_fib_datasets = [ infer_cfg=cmo_fib_infer_cfg, eval_cfg=cmo_fib_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py index 0fc523e1..2df1dff6 100644 --- a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_ace24b.py @@ -36,4 +36,4 @@ cmo_fib_datasets = [ infer_cfg=cmo_fib_infer_cfg, eval_cfg=cmo_fib_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/drop/drop_gen.py b/opencompass/configs/datasets/drop/drop_gen.py index 44592ff6..6940c25a 100644 --- a/opencompass/configs/datasets/drop/drop_gen.py +++ b/opencompass/configs/datasets/drop/drop_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, F403 \ No newline at end of file + from .drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/drop/drop_llm_judge_gen.py b/opencompass/configs/datasets/drop/drop_llm_judge_gen.py index 0694c276..e339812f 100644 --- a/opencompass/configs/datasets/drop/drop_llm_judge_gen.py +++ b/opencompass/configs/datasets/drop/drop_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .drop_llmjudge_gen_3857b0 import drop_datasets # noqa: F401, F403 \ No newline at end of file + from .drop_llmjudge_gen_3857b0 import drop_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/gpqa/gpqa_gen.py b/opencompass/configs/datasets/gpqa/gpqa_gen.py index 433ef9f5..a1ae96f8 100644 --- a/opencompass/configs/datasets/gpqa/gpqa_gen.py +++ b/opencompass/configs/datasets/gpqa/gpqa_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, F403 \ No newline at end of file + from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/hellaswag/hellaswag_gen.py b/opencompass/configs/datasets/hellaswag/hellaswag_gen.py index 7806d705..2ad2893c 100644 --- a/opencompass/configs/datasets/hellaswag/hellaswag_gen.py +++ b/opencompass/configs/datasets/hellaswag/hellaswag_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .hellaswag_10shot_gen_e42710 import hellaswag_datasets # noqa: F401, F403 \ No newline at end of file + from .hellaswag_10shot_gen_e42710 import hellaswag_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py b/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py index ff641d26..30aa503a 100644 --- a/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py +++ b/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets # noqa: F401, F403 \ No newline at end of file + from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/humaneval/humaneval_gen.py b/opencompass/configs/datasets/humaneval/humaneval_gen.py index 61c3f3b3..9ce7b267 100644 --- a/opencompass/configs/datasets/humaneval/humaneval_gen.py +++ 
b/opencompass/configs/datasets/humaneval/humaneval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets # noqa: F401, F403 \ No newline at end of file + from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/korbench/korbench_gen.py b/opencompass/configs/datasets/korbench/korbench_gen.py index 0492922a..4847c150 100644 --- a/opencompass/configs/datasets/korbench/korbench_gen.py +++ b/opencompass/configs/datasets/korbench/korbench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403 \ No newline at end of file + from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py index e87b86f8..7fb894f3 100644 --- a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py +++ b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets # noqa: F401, F403 \ No newline at end of file + from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py index 2a174f39..dc042d89 100644 --- a/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py +++ b/opencompass/configs/datasets/korbench/korbench_mixed_gen_d00bdd.py @@ -56,4 +56,4 @@ for category in categories: eval_cfg=eval_cfg, ) - korbench_mixed_datasets.append(korbench_dataset) \ No newline at end of file + korbench_mixed_datasets.append(korbench_dataset) diff --git a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py b/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py index 196a7978..c2ebbfad 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py +++ b/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py @@ -113,4 +113,4 @@ for category in categories: mode='singlescore', ) - korbench_0shot_single_datasets.append(korbench_dataset) \ No newline at end of file + korbench_0shot_single_datasets.append(korbench_dataset) diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py index b1966fe9..c75280d0 100644 --- a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403 \ No newline at end of file + from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen.py index 4977f1e2..b48484f4 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen.py +++ 
b/opencompass/configs/datasets/livemathbench/livemathbench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livemathbench_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file + from .livemathbench_gen_9befbf import livemathbench_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py index 4b8ca8fd..37fdee9c 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_6eb711.py @@ -46,4 +46,4 @@ livemathbench_datasets = [ infer_cfg=livemathbench_infer_cfg, eval_cfg=livemathbench_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py index d0f73023..b789df5b 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_caed8f.py @@ -46,4 +46,4 @@ livemathbench_datasets = [ infer_cfg=livemathbench_infer_cfg, eval_cfg=livemathbench_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py index c1d72d15..06bdf467 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file + from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py index c8f66615..a88313d5 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -42,4 +42,4 @@ livemathbench_dataset = dict( ) ) ) -livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file +livemathbench_datasets = [livemathbench_dataset] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py index 6b2f9f5a..b5984960 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py @@ -41,4 +41,4 @@ livemathbench_dataset = dict( ) ) ) -livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file +livemathbench_datasets = [livemathbench_dataset] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py index f956f83e..65b40d97 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py @@ -41,4 +41,4 @@ livemathbench_dataset = dict( ) ) ) -livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file 
+livemathbench_datasets = [livemathbench_dataset] diff --git a/opencompass/configs/datasets/maritimebench/maritimebench_gen.py b/opencompass/configs/datasets/maritimebench/maritimebench_gen.py new file mode 100644 index 00000000..9f28bc3b --- /dev/null +++ b/opencompass/configs/datasets/maritimebench/maritimebench_gen.py @@ -0,0 +1,42 @@ +from opencompass.datasets import MaritimeBenchDataset +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +maritimebench_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer', + train_split='test' # 明确指定使用test分割 +) + +maritimebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答: 当前题目:\n {question}\nA:{A}\nB:{B}\nC:{C}\nD:{D}') + ] + ), + ), + retriever=dict(type=ZeroRetriever), # 不使用上下文 + inferencer=dict(type=GenInferencer) # 添加推理器配置 +) + +maritimebench_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') +) + +maritimebench_datasets = [ + dict( + abbr='maritimebench', + type=MaritimeBenchDataset, + name='default', + path='opencompass/maritimebench', + reader_cfg=maritimebench_reader_cfg, + infer_cfg=maritimebench_infer_cfg, + eval_cfg=maritimebench_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_llm_judge_gen.py b/opencompass/configs/datasets/math/math_llm_judge_gen.py index fff23ef6..72839900 100644 --- a/opencompass/configs/datasets/math/math_llm_judge_gen.py +++ b/opencompass/configs/datasets/math/math_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403 \ No newline at end of file + from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py index 0faf8630..a50bce75 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py @@ -41,4 +41,4 @@ math_datasets = [ infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_gen.py index 759b1b63..cce9e8c6 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_prm800k_500_0shot_cot_gen_11c4b5 import math_datasets # noqa: F401, F403 \ No newline at end of file + from .math_prm800k_500_0shot_cot_gen_11c4b5 import math_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py index 461b3a9a..5cdbb4d7 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py +++ 
b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468 import math_datasets # noqa: F401, F403 \ No newline at end of file + from .math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468 import math_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu/mmlu_gen.py b/opencompass/configs/datasets/mmlu/mmlu_gen.py index 5c8303b8..5ceebaf5 100644 --- a/opencompass/configs/datasets/mmlu/mmlu_gen.py +++ b/opencompass/configs/datasets/mmlu/mmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py b/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py index b2389fb2..655cfa69 100644 --- a/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py +++ b/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_llmjudge_gen_f4336b import mmlu_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_llmjudge_gen_f4336b import mmlu_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py index 228dad99..65357db4 100644 --- a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py index a895d5c2..0ef6435d 100644 --- a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file + from .mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmmlu/README.md b/opencompass/configs/datasets/mmmlu/README.md index 8faef6b2..da130ff1 100644 --- a/opencompass/configs/datasets/mmmlu/README.md +++ b/opencompass/configs/datasets/mmmlu/README.md @@ -32,4 +32,4 @@ from datasets import load_dataset ds = load_dataset("openai/MMMLU", "default") from datasets import load_dataset ds = load_dataset("openai/MMMLU", "by_language") -``` \ No newline at end of file +``` diff --git a/opencompass/configs/datasets/mmmlu/mmmlu_gen.py b/opencompass/configs/datasets/mmmlu/mmmlu_gen.py index 55790978..328ddefd 100644 --- a/opencompass/configs/datasets/mmmlu/mmmlu_gen.py +++ b/opencompass/configs/datasets/mmmlu/mmmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmmlu_gen_c51a84 import mmmlu_datasets # noqa: F401, F403 \ No newline at end of file + from .mmmlu_gen_c51a84 import mmmlu_datasets # noqa: F401, F403 diff --git 
a/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py b/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py index 85c196c2..2034d3db 100644 --- a/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py +++ b/opencompass/configs/datasets/mmmlu/mmmlu_gen_c51a84.py @@ -102,4 +102,4 @@ for _name in mmmlu_all_sets: eval_cfg=mmmlu_eval_cfg, )) -del _name, _hint, _prompt \ No newline at end of file +del _name, _hint, _prompt diff --git a/opencompass/configs/datasets/mmmlu_lite/README.md b/opencompass/configs/datasets/mmmlu_lite/README.md index f7866a0b..6b02397f 100644 --- a/opencompass/configs/datasets/mmmlu_lite/README.md +++ b/opencompass/configs/datasets/mmmlu_lite/README.md @@ -35,4 +35,4 @@ MMMLU contains the MMLU test set translated into the following locales: ```python from datasets import load_dataset ds = load_dataset("opencompass/mmmlu_lite", "AR_XY") -``` \ No newline at end of file +``` diff --git a/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py b/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py index d918cc36..f6df5bdc 100644 --- a/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py +++ b/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmmlu_lite_gen_c51a84 import mmmlu_lite_datasets # noqa: F401, F403 \ No newline at end of file + from .mmmlu_lite_gen_c51a84 import mmmlu_lite_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py b/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py index 9e9a8ab4..61ec9b07 100644 --- a/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py +++ b/opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen_c51a84.py @@ -102,4 +102,4 @@ for _name in mmmlu_lite_all_sets: eval_cfg=mmmlu_lite_eval_cfg, )) -del _name, _hint, _prompt \ No newline at end of file +del _name, _hint, _prompt diff --git a/opencompass/configs/datasets/musr/musr_llm_judge_gen.py b/opencompass/configs/datasets/musr/musr_llm_judge_gen.py index 29bc39dc..a0f71ed0 100644 --- a/opencompass/configs/datasets/musr/musr_llm_judge_gen.py +++ b/opencompass/configs/datasets/musr/musr_llm_judge_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .musr_llmjudge_gen_b47fd3 import musr_datasets # noqa: F401, F403 \ No newline at end of file + from .musr_llmjudge_gen_b47fd3 import musr_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/omni_math/README.md b/opencompass/configs/datasets/omni_math/README.md index 12f54c1b..c4b9a7d6 100644 --- a/opencompass/configs/datasets/omni_math/README.md +++ b/opencompass/configs/datasets/omni_math/README.md @@ -40,4 +40,4 @@ omni_math_dataset['eval_cfg']['evaluator'].update( | llama-3_1-8b-instruct | qwen-2_5-7b-instruct | InternLM3-8b-Instruct | | -- | -- | -- | -| 15.18 | 29.97 | 32.75 | \ No newline at end of file +| 15.18 | 29.97 | 32.75 | diff --git a/opencompass/configs/datasets/omni_math/omni_math_gen.py b/opencompass/configs/datasets/omni_math/omni_math_gen.py index bb193ccf..ef368325 100644 --- a/opencompass/configs/datasets/omni_math/omni_math_gen.py +++ b/opencompass/configs/datasets/omni_math/omni_math_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .omni_math_gen_18cc08 import omni_math_datasets # noqa: F401, F403 \ No newline at end of file + from .omni_math_gen_18cc08 import omni_math_datasets # noqa: F401, F403 diff --git 
a/opencompass/configs/datasets/omni_math/omni_math_gen_18cc08.py b/opencompass/configs/datasets/omni_math/omni_math_gen_18cc08.py index 20920694..1372d1d7 100644 --- a/opencompass/configs/datasets/omni_math/omni_math_gen_18cc08.py +++ b/opencompass/configs/datasets/omni_math/omni_math_gen_18cc08.py @@ -42,4 +42,4 @@ omni_math_datasets = [ infer_cfg=infer_cfg, eval_cfg=eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py b/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py index 2a4320b8..0925f045 100644 --- a/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py +++ b/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py @@ -86,4 +86,4 @@ omnimath_datasets = [ infer_cfg=omnimath_infer_cfg, eval_cfg=omnimath_eval_cfg ) -] \ No newline at end of file +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2.py index a535003e..c4f61898 100644 --- a/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2.py +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_v2.py @@ -15,4 +15,4 @@ lmdeploy_deepseek_v2_model = [ batch_size=4, run_cfg=dict(num_gpus=8), ) -] \ No newline at end of file +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen_2_5_32b.py b/opencompass/configs/models/qwen2_5/hf_qwen_2_5_32b.py index ddd27f7f..823f3676 100644 --- a/opencompass/configs/models/qwen2_5/hf_qwen_2_5_32b.py +++ b/opencompass/configs/models/qwen2_5/hf_qwen_2_5_32b.py @@ -9,4 +9,4 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=2), ) -] \ No newline at end of file +] diff --git a/opencompass/configs/models/qwen2_5/hf_qwen_2_5_7b.py b/opencompass/configs/models/qwen2_5/hf_qwen_2_5_7b.py index 579950c6..9b3b2f6b 100644 --- a/opencompass/configs/models/qwen2_5/hf_qwen_2_5_7b.py +++ b/opencompass/configs/models/qwen2_5/hf_qwen_2_5_7b.py @@ -9,4 +9,4 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=1), ) -] \ No newline at end of file +] diff --git a/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py b/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py index 6c2bf078..b52d40ee 100644 --- a/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py +++ b/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py @@ -14,4 +14,4 @@ models = [ run_cfg=dict(num_gpus=2), pred_postprocessor=dict(type=extract_non_reasoning_content) ) -] \ No newline at end of file +] diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py index 95697144..44873412 100644 --- a/opencompass/configs/summarizers/groups/bbeh.py +++ b/opencompass/configs/summarizers/groups/bbeh.py @@ -10,4 +10,4 @@ _bbeh = [ 'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles' ] bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'naive_average'}) -bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'harmonic_mean'}) \ No newline at end of file +bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'harmonic_mean'}) diff --git a/opencompass/datasets/NPHardEval/__init__.py b/opencompass/datasets/NPHardEval/__init__.py index dcaabd11..76a3c4f1 100644 --- a/opencompass/datasets/NPHardEval/__init__.py +++ b/opencompass/datasets/NPHardEval/__init__.py @@ -1,9 +1,9 @@ -from .cmp_GCP_D import * # noqa: F401, F403 -from .cmp_KSP import * # noqa: F401, F403 -from .cmp_TSP_D import * # noqa: F401, F403 -from .hard_GCP import * # noqa: F401, F403 
-from .hard_MSP import * # noqa: F401, F403 -from .hard_TSP import * # noqa: F401, F403 -from .p_BSP import * # noqa: F401, F403 -from .p_EDP import * # noqa: F401, F403 -from .p_SPP import * # noqa: F401, F403 +from .cmp_GCP_D import * # noqa: F401, F403 +from .cmp_KSP import * # noqa: F401, F403 +from .cmp_TSP_D import * # noqa: F401, F403 +from .hard_GCP import * # noqa: F401, F403 +from .hard_MSP import * # noqa: F401, F403 +from .hard_TSP import * # noqa: F401, F403 +from .p_BSP import * # noqa: F401, F403 +from .p_EDP import * # noqa: F401, F403 +from .p_SPP import * # noqa: F401, F403 diff --git a/opencompass/datasets/NPHardEval/cmp_GCP_D.py b/opencompass/datasets/NPHardEval/cmp_GCP_D.py index 5090fb50..f0d40213 100644 --- a/opencompass/datasets/NPHardEval/cmp_GCP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_GCP_D.py @@ -1,167 +1,167 @@ -import ast - -try: - import networkx as nx -except ImportError: - nx = None - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import gcp_dPrompts - - -def q2text(q, p=gcp_dPrompts): - number_of_colors = q.split('\n')[0].split()[-2] # last character of the first line - number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(total_vertices=number_of_vertices, number_of_colors=number_of_colors) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] + '\n' + \ - '\n The graph is below: \n' - for line in q.split('\n')[2:]: - vertex_list = line.split(' ') - this_line = 'Vertex {} is connected to vertex {}.'.format( - vertex_list[1], vertex_list[2]) - prompt_text += this_line + '\n' - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class CMP_GCP_D_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - for file_num in range(10): - with open(data_path + 'decision_data_GCP_{}.txt'.format(file_num)) as f: - data = f.read() - sample = data.split('\n\n')[:-1] - all_data += zip([file_num + 1] * len(sample), sample) - for (level, q) in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + q, - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class CMP_GCP_D_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - details = {} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = q.split('####\n')[-1] - try: - number_of_colors = int(q.split('\n')[0].split()[-2]) - output, reasoning = self.parse_xml_to_dict(output) - output_dict['output'] = output - output_dict['correctness'], _ = self.gcp_decision_check(q, output, number_of_colors) - except Exception as e: - print(f'Attempt failed: {e}') - output_dict['correctness'] = False - output_dict['reasoning'] = reasoning - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - details[str(index)] = {'q': q, 'output': output, 'result': r} - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - result['details'] = details - final_result = 
{'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - except Exception: - final_answer_element = '' - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def read_dimacs_format(self, dimacs_str): - lines = dimacs_str.strip().split('\n') - p_line = next(line for line in lines if line.startswith('p')) - _, _, num_vertices, num_edges = p_line.split() - num_vertices, num_edges = int(num_vertices), int(num_edges) - - adjacency_list = {i: set() for i in range(1, num_vertices + 1)} - for line in lines: - if line.startswith('e'): - _, vertex1, vertex2 = line.split() - vertex1, vertex2 = int(vertex1), int(vertex2) - if vertex1 in adjacency_list and vertex2 in adjacency_list: - adjacency_list[vertex1].add(vertex2) - adjacency_list[vertex2].add(vertex1) - - return num_vertices, adjacency_list - - def gcp_greedy_solution(self, adjacency_list): - """Provides a greedy solution to the GCP problem. - - :param adjacency_list: A dictionary of the adjacency list. - :return: A tuple of (num_colors, coloring). - """ - G = nx.Graph() - G.add_nodes_from(adjacency_list.keys()) - for vertex, neighbors in adjacency_list.items(): - for neighbor in neighbors: - G.add_edge(vertex, neighbor) - coloring = nx.coloring.greedy_color(G, strategy='largest_first') - num_colors = max(coloring.values()) + 1 - return num_colors, coloring - - def gcp_decision_check(self, dimacs_str, answer, k_colors): - """Check if the given GCP instance is feasible with k_colors. - - :param dimacs_str: The DIMACS format string of the GCP instance. - :param answer: The answer returned by the model. - :param k_colors: The target number of colors. - :return: A tuple of (is_correct, message). 
- """ - num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str) - try: - is_feasible = answer.get('Feasible', 'no').lower() == 'yes' - except Exception: - return False, 'Feasible key not found' - num_colors, coloring = self.gcp_greedy_solution(adjacency_list) - exist_optimal = num_colors <= k_colors - if is_feasible != exist_optimal: - if exist_optimal: - return False, f'Feasibility mismatch: {coloring}' - else: - return False, f'Feasibility mismatch: {is_feasible} vs {exist_optimal}' - return True, 'Feasible' if is_feasible else 'Infeasible' +import ast + +try: + import networkx as nx +except ImportError: + nx = None + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import gcp_dPrompts + + +def q2text(q, p=gcp_dPrompts): + number_of_colors = q.split('\n')[0].split()[-2] # last character of the first line + number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(total_vertices=number_of_vertices, number_of_colors=number_of_colors) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + '\n' + \ + '\n The graph is below: \n' + for line in q.split('\n')[2:]: + vertex_list = line.split(' ') + this_line = 'Vertex {} is connected to vertex {}.'.format( + vertex_list[1], vertex_list[2]) + prompt_text += this_line + '\n' + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class CMP_GCP_D_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + for file_num in range(10): + with open(data_path + 'decision_data_GCP_{}.txt'.format(file_num)) as f: + data = f.read() + sample = data.split('\n\n')[:-1] + all_data += zip([file_num + 1] * len(sample), sample) + for (level, q) in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + q, + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class CMP_GCP_D_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + details = {} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = q.split('####\n')[-1] + try: + number_of_colors = int(q.split('\n')[0].split()[-2]) + output, reasoning = self.parse_xml_to_dict(output) + output_dict['output'] = output + output_dict['correctness'], _ = self.gcp_decision_check(q, output, number_of_colors) + except Exception as e: + print(f'Attempt failed: {e}') + output_dict['correctness'] = False + output_dict['reasoning'] = reasoning + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + details[str(index)] = {'q': q, 'output': output, 'result': r} + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + result['details'] = details + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = 
xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + except Exception: + final_answer_element = '' + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def read_dimacs_format(self, dimacs_str): + lines = dimacs_str.strip().split('\n') + p_line = next(line for line in lines if line.startswith('p')) + _, _, num_vertices, num_edges = p_line.split() + num_vertices, num_edges = int(num_vertices), int(num_edges) + + adjacency_list = {i: set() for i in range(1, num_vertices + 1)} + for line in lines: + if line.startswith('e'): + _, vertex1, vertex2 = line.split() + vertex1, vertex2 = int(vertex1), int(vertex2) + if vertex1 in adjacency_list and vertex2 in adjacency_list: + adjacency_list[vertex1].add(vertex2) + adjacency_list[vertex2].add(vertex1) + + return num_vertices, adjacency_list + + def gcp_greedy_solution(self, adjacency_list): + """Provides a greedy solution to the GCP problem. + + :param adjacency_list: A dictionary of the adjacency list. + :return: A tuple of (num_colors, coloring). + """ + G = nx.Graph() + G.add_nodes_from(adjacency_list.keys()) + for vertex, neighbors in adjacency_list.items(): + for neighbor in neighbors: + G.add_edge(vertex, neighbor) + coloring = nx.coloring.greedy_color(G, strategy='largest_first') + num_colors = max(coloring.values()) + 1 + return num_colors, coloring + + def gcp_decision_check(self, dimacs_str, answer, k_colors): + """Check if the given GCP instance is feasible with k_colors. + + :param dimacs_str: The DIMACS format string of the GCP instance. + :param answer: The answer returned by the model. + :param k_colors: The target number of colors. + :return: A tuple of (is_correct, message). 
+ """ + num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str) + try: + is_feasible = answer.get('Feasible', 'no').lower() == 'yes' + except Exception: + return False, 'Feasible key not found' + num_colors, coloring = self.gcp_greedy_solution(adjacency_list) + exist_optimal = num_colors <= k_colors + if is_feasible != exist_optimal: + if exist_optimal: + return False, f'Feasibility mismatch: {coloring}' + else: + return False, f'Feasibility mismatch: {is_feasible} vs {exist_optimal}' + return True, 'Feasible' if is_feasible else 'Infeasible' diff --git a/opencompass/datasets/NPHardEval/cmp_KSP.py b/opencompass/datasets/NPHardEval/cmp_KSP.py index d1277a79..1c374d8d 100644 --- a/opencompass/datasets/NPHardEval/cmp_KSP.py +++ b/opencompass/datasets/NPHardEval/cmp_KSP.py @@ -1,185 +1,185 @@ -import ast -import json - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import kspPrompts - - -def q2text(q, p=kspPrompts): - knapsack_capacity = q['knapsack_capacity'] - items = q['items'] - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(knapsack_capacity=knapsack_capacity) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] + \ - '\n The items details are as below: \n' - for item in items: - this_line = f"Item {item['id']} has weight {item['weight']} and value {item['value']}." - prompt_text += this_line + '\n' - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class CMP_KSP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - with open(data_path + 'ksp_instances.json', 'r') as f: - data = json.load(f) - for sample in data: - level = len(sample['items']) - 3 - all_data.append((level, sample)) - for (level, q) in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class CMP_KSP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - details = {} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - try: - llm_string = q - output, reasoning = self.parse_xml_to_dict(llm_string) - output_dict['output'] = output - output_dict['correctness'], _ = self.kspCheck(q, output) - output_dict['reasoning'] = reasoning - output_dict['level'] = level - except Exception as e: - print(f'Attempt failed: {e}') - if output_dict: - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - else: - print(f'Failed to run {q}') - r = 'fail' - - result[r] += level - details[str(index)] = {'q': q, 'output': output, 'result': r} - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - result['details'] = details - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = 
xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - except Exception: - final_answer_element = '' - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def ksp_optimal_solution(self, knapsacks, capacity): - """Provides the optimal solution for the KSP instance with dynamic - programming. - - :param knapsacks: A dictionary of the knapsacks. - :param capacity: The capacity of the knapsack. - :return: The optimal value. - """ - # num_knapsacks = len(knapsacks) - - # Create a one-dimensional array to store intermediate solutions - dp = [0] * (capacity + 1) - - for itemId, (weight, value) in knapsacks.items(): - for w in range(capacity, weight - 1, -1): - dp[w] = max(dp[w], value + dp[w - weight]) - - return dp[capacity] - - # KSP - def kspCheck(self, instance, solution): - """Validates the solution for the KSP instance. - - :param instance: A dictionary of the KSP instance. - :param solution: A dictionary of the solution. - :return: A tuple of (is_correct, message). - """ - # Change string key to integer key and value to boolean - items = instance.get('items', []) - knapsacks = { - item['id']: (item['weight'], item['value']) - for item in items - } - - ksp_optimal_value = self.ksp_optimal_solution( - knapsacks, instance['knapsack_capacity']) - - try: - is_feasible = (solution.get('Feasible', '').lower() == 'yes') - except Exception: - return False, f'Output format is incorrect.' - if is_feasible != (ksp_optimal_value > 0): - return False, f'The solution is {is_feasible} but the optimal solution is {ksp_optimal_value > 0}.' - - total_value = int(solution.get('TotalValue', -1)) - selectedItems = list(map(int, solution.get('SelectedItemIds', []))) - - if len(set(selectedItems)) != len(selectedItems): - return False, f'Duplicate items are selected.' - - total_weight = 0 - cum_value = 0 - - # Calculate total weight and value of selected items - for item in selectedItems: - if knapsacks.get(item, False): - weight, value = knapsacks[item] - total_weight += weight - cum_value += value - else: - return False, f'Item {item} does not exist.' - - # Check if the item weight exceeds the knapsack capacity - if total_weight > instance['knapsack_capacity']: - return False, f"Total weight {total_weight} exceeds knapsack capacity {instance['knapsack_capacity']}." - - if total_value != cum_value: - return False, f'The total value {total_value} does not match the cumulative value {cum_value} of the selected items.' - - if total_value != ksp_optimal_value: - return False, f'The total value {total_value} does not match the optimal value {ksp_optimal_value}.' - - return True, f'The solution is valid with total weight {total_weight} and total value {total_value}.' 
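Across the NPHardEval evaluators, `parse_xml_to_dict` slices the model output between the `<final_answer>`/`</final_answer>` and `<reasoning>`/`</reasoning>` tags that the prompts request, then attempts `ast.literal_eval` on the answer portion. A minimal standalone sketch of that extraction follows; the `llm_output` string is invented for illustration only.

```
# Sketch of the tag-extraction step shared by the NPHardEval evaluators.
# The llm_output string at the bottom is invented for illustration only.
import ast


def extract_answer(llm_output):
    """Return (final_answer, reasoning) taken from <final_answer>/<reasoning> tags."""
    try:
        answer_start = llm_output.index('<final_answer>') + len('<final_answer>')
        answer_end = llm_output.index('</final_answer>')
        reasoning_start = llm_output.index('<reasoning>') + len('<reasoning>')
        reasoning_end = llm_output.index('</reasoning>')
        answer = llm_output[answer_start:answer_end].strip()
        reasoning = llm_output[reasoning_start:reasoning_end].strip()
        try:
            # e.g. turn "{'Feasible': 'YES', ...}" into a dict
            answer = ast.literal_eval(answer)
        except Exception:
            answer = ''  # unparsable answers fall back to '', as in the evaluators
    except ValueError:  # a tag is missing entirely
        answer, reasoning = '', ''
    return answer, reasoning


llm_output = ('<reasoning>Items 1, 2 and 4 fit the capacity.</reasoning>'
              "<final_answer>{'Feasible': 'YES', 'TotalValue': 15, "
              "'SelectedItemIds': [1, 2, 4]}</final_answer>")
print(extract_answer(llm_output))
```

When a tag is missing or the literal cannot be parsed, the evaluators are left with empty strings, which the downstream checks then score as an incorrect answer.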
+import ast +import json + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import kspPrompts + + +def q2text(q, p=kspPrompts): + knapsack_capacity = q['knapsack_capacity'] + items = q['items'] + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(knapsack_capacity=knapsack_capacity) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + \ + '\n The items details are as below: \n' + for item in items: + this_line = f"Item {item['id']} has weight {item['weight']} and value {item['value']}." + prompt_text += this_line + '\n' + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class CMP_KSP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + with open(data_path + 'ksp_instances.json', 'r') as f: + data = json.load(f) + for sample in data: + level = len(sample['items']) - 3 + all_data.append((level, sample)) + for (level, q) in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class CMP_KSP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + details = {} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + try: + llm_string = q + output, reasoning = self.parse_xml_to_dict(llm_string) + output_dict['output'] = output + output_dict['correctness'], _ = self.kspCheck(q, output) + output_dict['reasoning'] = reasoning + output_dict['level'] = level + except Exception as e: + print(f'Attempt failed: {e}') + if output_dict: + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + else: + print(f'Failed to run {q}') + r = 'fail' + + result[r] += level + details[str(index)] = {'q': q, 'output': output, 'result': r} + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + result['details'] = details + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + except Exception: + final_answer_element = '' + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def ksp_optimal_solution(self, knapsacks, capacity): + """Provides the optimal solution for the KSP instance with dynamic + programming. + + :param knapsacks: A dictionary of the knapsacks. + :param capacity: The capacity of the knapsack. + :return: The optimal value. 
+ """ + # num_knapsacks = len(knapsacks) + + # Create a one-dimensional array to store intermediate solutions + dp = [0] * (capacity + 1) + + for itemId, (weight, value) in knapsacks.items(): + for w in range(capacity, weight - 1, -1): + dp[w] = max(dp[w], value + dp[w - weight]) + + return dp[capacity] + + # KSP + def kspCheck(self, instance, solution): + """Validates the solution for the KSP instance. + + :param instance: A dictionary of the KSP instance. + :param solution: A dictionary of the solution. + :return: A tuple of (is_correct, message). + """ + # Change string key to integer key and value to boolean + items = instance.get('items', []) + knapsacks = { + item['id']: (item['weight'], item['value']) + for item in items + } + + ksp_optimal_value = self.ksp_optimal_solution( + knapsacks, instance['knapsack_capacity']) + + try: + is_feasible = (solution.get('Feasible', '').lower() == 'yes') + except Exception: + return False, f'Output format is incorrect.' + if is_feasible != (ksp_optimal_value > 0): + return False, f'The solution is {is_feasible} but the optimal solution is {ksp_optimal_value > 0}.' + + total_value = int(solution.get('TotalValue', -1)) + selectedItems = list(map(int, solution.get('SelectedItemIds', []))) + + if len(set(selectedItems)) != len(selectedItems): + return False, f'Duplicate items are selected.' + + total_weight = 0 + cum_value = 0 + + # Calculate total weight and value of selected items + for item in selectedItems: + if knapsacks.get(item, False): + weight, value = knapsacks[item] + total_weight += weight + cum_value += value + else: + return False, f'Item {item} does not exist.' + + # Check if the item weight exceeds the knapsack capacity + if total_weight > instance['knapsack_capacity']: + return False, f"Total weight {total_weight} exceeds knapsack capacity {instance['knapsack_capacity']}." + + if total_value != cum_value: + return False, f'The total value {total_value} does not match the cumulative value {cum_value} of the selected items.' + + if total_value != ksp_optimal_value: + return False, f'The total value {total_value} does not match the optimal value {ksp_optimal_value}.' + + return True, f'The solution is valid with total weight {total_weight} and total value {total_value}.' 
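`ksp_optimal_solution` above computes the exact knapsack optimum with a one-dimensional dynamic-programming table indexed by remaining capacity, walking capacities downwards so each item is used at most once; `kspCheck` then compares the model's claimed feasibility, selected items and `TotalValue` against that optimum. A self-contained sketch of the same recurrence, with made-up item data:

```
# Standalone sketch of the 1-D 0/1 knapsack DP used by ksp_optimal_solution.
# The item data below is made up; real instances come from ksp_instances.json.


def knapsack_optimal_value(knapsacks, capacity):
    """knapsacks: {item_id: (weight, value)}; returns the best achievable value."""
    dp = [0] * (capacity + 1)  # dp[w] = best value achievable with capacity w
    for _item_id, (weight, value) in knapsacks.items():
        # Walk capacities downwards so each item is counted at most once.
        for w in range(capacity, weight - 1, -1):
            dp[w] = max(dp[w], value + dp[w - weight])
    return dp[capacity]


knapsacks = {1: (2, 3), 2: (3, 4), 3: (4, 5), 4: (5, 8)}  # id: (weight, value)
print(knapsack_optimal_value(knapsacks, capacity=10))  # -> 15 (items 1, 2 and 4)
```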
diff --git a/opencompass/datasets/NPHardEval/cmp_TSP_D.py b/opencompass/datasets/NPHardEval/cmp_TSP_D.py index bff15260..95768750 100644 --- a/opencompass/datasets/NPHardEval/cmp_TSP_D.py +++ b/opencompass/datasets/NPHardEval/cmp_TSP_D.py @@ -1,156 +1,156 @@ -import ast -import json - -try: - import networkx as nx -except ImportError: - nx = None - -import pandas as pd -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import tsp_dPrompts - - -def q2text(adj_matrix, distance_limit, p=tsp_dPrompts): - total_cities = adj_matrix.shape[0] # exclude the last row - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(total_cities=total_cities, distance_limit=distance_limit) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] + '\n' + \ - 'The distances between cities are below: \n' - - for i in range(adj_matrix.shape[0]): - for j in range(adj_matrix.shape[1]): - if i < j: # only use the upper triangle - this_line = 'The distance between City {} and City {} is {}.'.format(i, j, adj_matrix[i, j]) - prompt_text += this_line + '\n' - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class CMP_TSP_D_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - for level in range(10): - for file_num in range(10): - df = pd.read_csv(data_path + 'decision_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1), - header=None, - index_col=False) - all_data.append((level + 1, df)) - - for (level, q) in all_data: - threshold = q.iloc[-1, 0] # therashold is the last row - distance_matrix = q.iloc[: - -1].values # distance matrix is the rest of the rows - prompt = q2text(distance_matrix, threshold) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q.to_json()), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class CMP_TSP_D_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - details = {} - tsp_d_Results = [] - for index, (q, llm_string) in enumerate(zip(references, predictions)): - output_dict = {} - output, reasoning = self.parse_xml_to_dict(llm_string) - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - q = pd.DataFrame(eval(q)) - threshold = q.iloc[-1, 0] # therashold is the last row - distance_matrix = q.iloc[:-1].values # distance matrix is the rest of the rows - output_dict['output'] = output - try: - output_dict['correctness'], _ = self.tsp_decision_check(distance_matrix, threshold, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['reasoning'] = reasoning - output_dict['level'] = level - if output_dict: - tsp_d_Results.append(output_dict) - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - - result[r] += level - details[str(index)] = {'q': q, 'output': output, 'result': r} - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - result['details'] = details - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - 
assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - except Exception: - final_answer_element = '' - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def tsp_approx(self, distance_matrix): - """Returns an approximate solution to the TSP problem. - - :param distance_matrix: A 2D numpy array representing the distance matrix. - :return: A list of the cities in the order they were visited. - """ - G = nx.from_numpy_array(distance_matrix) - return nx.approximation.traveling_salesman_problem(G) - - def tsp_decision_check(self, distance_matrix, threshold, tour): - """Checks if a given TSP tour is valid and within the threshold - distance. - - :param distance_matrix: A 2D numpy array representing the distance matrix. - :param threshold: The maximum distance allowed. - :param tour: A dictionary containing the feasibility. - """ - try: - is_feasible = tour.get('Feasible', 'no').lower() == 'yes' - except Exception: - return False, 'Output format incorrect' - - # Calculate the approxed distance of the tour - tours = self.tsp_approx(distance_matrix) - tour_distance = sum(distance_matrix[tours[i], tours[i + 1]] for i in range(len(tours) - 1)) + distance_matrix[tours[-1], tours[0]] - - if is_feasible != (tour_distance <= threshold): - return False, f'Feasibility mismatch: {is_feasible} vs {tour_distance} > {threshold}' - return True, 'Feasible: {} <= {}'.format(tour_distance, threshold) +import ast +import json + +try: + import networkx as nx +except ImportError: + nx = None + +import pandas as pd +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import tsp_dPrompts + + +def q2text(adj_matrix, distance_limit, p=tsp_dPrompts): + total_cities = adj_matrix.shape[0] # exclude the last row + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(total_cities=total_cities, distance_limit=distance_limit) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + '\n' + \ + 'The distances between cities are below: \n' + + for i in range(adj_matrix.shape[0]): + for j in range(adj_matrix.shape[1]): + if i < j: # only use the upper triangle + this_line = 'The distance between City {} and City {} is {}.'.format(i, j, adj_matrix[i, j]) + prompt_text += this_line + '\n' + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class CMP_TSP_D_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + for level in range(10): + for file_num in range(10): + df = pd.read_csv(data_path + 'decision_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1), + header=None, + index_col=False) + all_data.append((level + 1, df)) + + for (level, q) in all_data: + threshold = q.iloc[-1, 0] # therashold is the last row + distance_matrix = q.iloc[: + -1].values # 
distance matrix is the rest of the rows + prompt = q2text(distance_matrix, threshold) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q.to_json()), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class CMP_TSP_D_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + details = {} + tsp_d_Results = [] + for index, (q, llm_string) in enumerate(zip(references, predictions)): + output_dict = {} + output, reasoning = self.parse_xml_to_dict(llm_string) + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + q = pd.DataFrame(eval(q)) + threshold = q.iloc[-1, 0] # therashold is the last row + distance_matrix = q.iloc[:-1].values # distance matrix is the rest of the rows + output_dict['output'] = output + try: + output_dict['correctness'], _ = self.tsp_decision_check(distance_matrix, threshold, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['reasoning'] = reasoning + output_dict['level'] = level + if output_dict: + tsp_d_Results.append(output_dict) + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + + result[r] += level + details[str(index)] = {'q': q, 'output': output, 'result': r} + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + result['details'] = details + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + except Exception: + final_answer_element = '' + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def tsp_approx(self, distance_matrix): + """Returns an approximate solution to the TSP problem. + + :param distance_matrix: A 2D numpy array representing the distance matrix. + :return: A list of the cities in the order they were visited. + """ + G = nx.from_numpy_array(distance_matrix) + return nx.approximation.traveling_salesman_problem(G) + + def tsp_decision_check(self, distance_matrix, threshold, tour): + """Checks if a given TSP tour is valid and within the threshold + distance. + + :param distance_matrix: A 2D numpy array representing the distance matrix. + :param threshold: The maximum distance allowed. + :param tour: A dictionary containing the feasibility. 
+ """ + try: + is_feasible = tour.get('Feasible', 'no').lower() == 'yes' + except Exception: + return False, 'Output format incorrect' + + # Calculate the approxed distance of the tour + tours = self.tsp_approx(distance_matrix) + tour_distance = sum(distance_matrix[tours[i], tours[i + 1]] for i in range(len(tours) - 1)) + distance_matrix[tours[-1], tours[0]] + + if is_feasible != (tour_distance <= threshold): + return False, f'Feasibility mismatch: {is_feasible} vs {tour_distance} > {threshold}' + return True, 'Feasible: {} <= {}'.format(tour_distance, threshold) diff --git a/opencompass/datasets/NPHardEval/hard_GCP.py b/opencompass/datasets/NPHardEval/hard_GCP.py index 24cc6fc5..99222cc9 100644 --- a/opencompass/datasets/NPHardEval/hard_GCP.py +++ b/opencompass/datasets/NPHardEval/hard_GCP.py @@ -1,191 +1,191 @@ -import ast -import xml.etree.ElementTree as ET - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import gcpPrompts - - -def q2text(q, p=gcpPrompts): # q is the data for the HP-hard question, p is the prompt - # print(q) - chromatic_number = q.split('\n')[0][-1] # last character of the first line - number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line - prompt_text = p['Intro'] + '\n' \ - + p['Initial_question'].format(max_vertices=number_of_vertices,max_colors=chromatic_number) + '\n' \ - + p['Output_content'] + '\n' \ - + p['Output_format'] + \ - '\n The graph is below: \n' - for line in q.split('\n')[2:]: - vertex_list = line.split(' ') - this_line = 'Vertex {} is connected to vertex {}.'.format(vertex_list[1], vertex_list[2]) - prompt_text += this_line + '\n' - - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class HardGCPDataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - for file_num in range(10): - with open(data_path + 'synthesized_data_GCP_{}.txt'.format(file_num)) as f: - data = f.read() - sample = data.split('\n\n')[:-1] - all_data += zip([file_num + 1] * len(sample), sample) - for (level, q) in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + q, - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class HardGCPEvaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - details = {} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = q.split('####\n')[-1] - - output_dict['output'] = output - try: - output_dict['correctness'] = self.gcpCheck(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['level'] = level - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - details[str(index)] = {'q': q, 'output': output, 'result': r} - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - result['details'] = details - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - # Parse the XML string - root = ET.fromstring(xml_string) - 
- # Find the 'final_answer' tag - final_answer_element = root.find('final_answer') - - # Find the 'reasoning' tag - reasoning_element = root.find('reasoning') - except Exception: - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end] - reasoning_element = xml_string[reasoning_start:reasoning_end] - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def gcpCheck(self, dimacs_str, answer_str): - num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str) - answer_colors = self.parse_answer(answer_str) - # print(adjacency_list) - # print(answer_colors) - - # Check if all colors in the answer are valid - for vertex, neighbors in adjacency_list.items(): - for neighbor in neighbors: - try: - if answer_colors[vertex] == answer_colors[neighbor]: - print(f'Invalid coloring: Vertex {vertex} and {neighbor} have the same color.') - return False - except: - print(f'Invalid input.') # dealing with hullucination - return False - - print(f'Valid coloring found with {len(set(answer_colors.values()))} colors: {answer_colors}') - return True - - def read_dimacs_format(self, dimacs_str): - lines = dimacs_str.strip().split('\n') - # Read the number of vertices and edges - p_line = next(line for line in lines if line.startswith('p')) - _, _, num_vertices, num_edges = p_line.split() - num_vertices, num_edges = int(num_vertices), int(num_edges) - - # Create adjacency list - adjacency_list = {i: set() for i in range(1, num_vertices + 1)} - - # Read the edges and ignore those that reference non-existing vertices - for line in lines: - if line.startswith('e'): - _, vertex1, vertex2 = line.split() - vertex1, vertex2 = int(vertex1), int(vertex2) - if vertex1 in adjacency_list and vertex2 in adjacency_list: - adjacency_list[vertex1].add(vertex2) - adjacency_list[vertex2].add(vertex1) - - return num_vertices, adjacency_list - - def parse_answer(self, llm_string): - # # Convert the answer string to a dictionary - # answer_dict = {} - # # Remove the braces and split the string by commas - # entries = answer_str.strip("}{").split(', ') - # for entry in entries: - # vertex, color = entry.split(':') - # answer_dict[int(vertex)] = color - # return answer_dict - - all_answers, reasoning_element = self.parse_xml_to_dict(llm_string) - - if all_answers == '': - return {} - elif all_answers is None: - return {} - else: - if isinstance(all_answers, str): - try: - all_answers = ast.literal_eval(all_answers) - except Exception: - try: - all_answers = ast.literal_eval('{' + all_answers + '}') - except Exception: - return {} - else: - all_answers = ast.literal_eval(all_answers.text) - # answer_dict = {} - # for pair in all_answers: - # vertex, color = pair.split(":") - # answer_dict[int(vertex)] = color - # convert key type to int - all_answers = {int(k): v for k, v in all_answers.items()} - return all_answers # answer_dict +import ast +import xml.etree.ElementTree as ET + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import gcpPrompts + + +def 
q2text(q, p=gcpPrompts): # q is the data for the HP-hard question, p is the prompt + # print(q) + chromatic_number = q.split('\n')[0][-1] # last character of the first line + number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line + prompt_text = p['Intro'] + '\n' \ + + p['Initial_question'].format(max_vertices=number_of_vertices,max_colors=chromatic_number) + '\n' \ + + p['Output_content'] + '\n' \ + + p['Output_format'] + \ + '\n The graph is below: \n' + for line in q.split('\n')[2:]: + vertex_list = line.split(' ') + this_line = 'Vertex {} is connected to vertex {}.'.format(vertex_list[1], vertex_list[2]) + prompt_text += this_line + '\n' + + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class HardGCPDataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + for file_num in range(10): + with open(data_path + 'synthesized_data_GCP_{}.txt'.format(file_num)) as f: + data = f.read() + sample = data.split('\n\n')[:-1] + all_data += zip([file_num + 1] * len(sample), sample) + for (level, q) in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + q, + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class HardGCPEvaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + details = {} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = q.split('####\n')[-1] + + output_dict['output'] = output + try: + output_dict['correctness'] = self.gcpCheck(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['level'] = level + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + details[str(index)] = {'q': q, 'output': output, 'result': r} + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + result['details'] = details + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + # Parse the XML string + root = ET.fromstring(xml_string) + + # Find the 'final_answer' tag + final_answer_element = root.find('final_answer') + + # Find the 'reasoning' tag + reasoning_element = root.find('reasoning') + except Exception: + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end] + reasoning_element = xml_string[reasoning_start:reasoning_end] + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def gcpCheck(self, dimacs_str, answer_str): + num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str) + answer_colors = self.parse_answer(answer_str) + # print(adjacency_list) + # print(answer_colors) + + # Check if all colors in the answer are valid + for vertex, neighbors in adjacency_list.items(): + for neighbor in neighbors: + try: + if answer_colors[vertex] == answer_colors[neighbor]: + 
print(f'Invalid coloring: Vertex {vertex} and {neighbor} have the same color.') + return False + except: + print(f'Invalid input.') # dealing with hullucination + return False + + print(f'Valid coloring found with {len(set(answer_colors.values()))} colors: {answer_colors}') + return True + + def read_dimacs_format(self, dimacs_str): + lines = dimacs_str.strip().split('\n') + # Read the number of vertices and edges + p_line = next(line for line in lines if line.startswith('p')) + _, _, num_vertices, num_edges = p_line.split() + num_vertices, num_edges = int(num_vertices), int(num_edges) + + # Create adjacency list + adjacency_list = {i: set() for i in range(1, num_vertices + 1)} + + # Read the edges and ignore those that reference non-existing vertices + for line in lines: + if line.startswith('e'): + _, vertex1, vertex2 = line.split() + vertex1, vertex2 = int(vertex1), int(vertex2) + if vertex1 in adjacency_list and vertex2 in adjacency_list: + adjacency_list[vertex1].add(vertex2) + adjacency_list[vertex2].add(vertex1) + + return num_vertices, adjacency_list + + def parse_answer(self, llm_string): + # # Convert the answer string to a dictionary + # answer_dict = {} + # # Remove the braces and split the string by commas + # entries = answer_str.strip("}{").split(', ') + # for entry in entries: + # vertex, color = entry.split(':') + # answer_dict[int(vertex)] = color + # return answer_dict + + all_answers, reasoning_element = self.parse_xml_to_dict(llm_string) + + if all_answers == '': + return {} + elif all_answers is None: + return {} + else: + if isinstance(all_answers, str): + try: + all_answers = ast.literal_eval(all_answers) + except Exception: + try: + all_answers = ast.literal_eval('{' + all_answers + '}') + except Exception: + return {} + else: + all_answers = ast.literal_eval(all_answers.text) + # answer_dict = {} + # for pair in all_answers: + # vertex, color = pair.split(":") + # answer_dict[int(vertex)] = color + # convert key type to int + all_answers = {int(k): v for k, v in all_answers.items()} + return all_answers # answer_dict diff --git a/opencompass/datasets/NPHardEval/hard_MSP.py b/opencompass/datasets/NPHardEval/hard_MSP.py index 7c4f4db3..e49f8a21 100644 --- a/opencompass/datasets/NPHardEval/hard_MSP.py +++ b/opencompass/datasets/NPHardEval/hard_MSP.py @@ -1,205 +1,205 @@ -import ast -import json -import xml.etree.ElementTree as ET - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import mspPrompts - - -def q2text(q, p=mspPrompts): # q is the data for the HP-hard question, p is the prompt - total_participants = q['participants'] - total_timeslots = q['time_slots'] - prompt_text = p['Intro'] + '\n' \ - + p['Initial_question'].format(total_participants=total_participants,total_timeslots=total_timeslots) + '\n' \ - + p['Output_content'] + '\n' \ - + p['Output_format'] + \ - '\n The meetings and participants details are as below: \n' - meetings = q['meetings'] - participants = q['participants'] - for meeting in meetings: - this_line = 'Meeting {} is with duration {}.'.format(meeting['id'], meeting['duration']) - prompt_text += this_line + '\n' - for j in participants.keys(): - this_line = 'Participant {} is available at time slots {} and has meetings {}.'.format(j, participants[j]['available_slots'], participants[j]['meetings']) - prompt_text += this_line + '\n' - return 
prompt_text - - -@LOAD_DATASET.register_module(force=True) -class Hard_MSP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - with open(data_path + 'msp_instances.json', 'r') as f: - data = json.load(f) - all_data = zip([int(d['complexity_level']) for d in data], data) - - for (level, q) in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class Hard_MSP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - - output_dict['output'] = output - output_dict['level'] = level - try: - output_dict['correctness'], _ = self.mspCheck(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def mspCheck(self, instance, llm_string): - """Validate the MSP solution. - - Parameters: - - instance: The MSP instance as a dictionary. - - solution: A dictionary with meeting ids as keys and lists of scheduled time slots as values. - - Returns: - - A tuple (is_valid, message). is_valid is True if the solution is valid, False otherwise. - message contains information about the validity of the solution. - """ - # print(llm_string) - solution, reasoning_element = self.parse_xml_to_dict(llm_string) - # print(solution.text) - - # convert solution to dictionary - if solution == '': - return False, None - elif solution is None: - return False, None - else: - if isinstance(solution, str): - try: - solution = ast.literal_eval(solution) - if solution is None: - return False, None - except Exception: - try: - solution = ast.literal_eval('{' + solution + '}') - if solution is None: - return False, None - except Exception: - return False, None - else: - try: - solution = ast.literal_eval(solution.text) - if solution is None: - return False, None - except Exception: - return False, None - # convert key type to int - if isinstance(solution, dict): - print(solution) - solution = {int(k): v for k, v in solution.items()} - else: - return False, None - - # Check if all meetings are scheduled within the available time slots - for meeting in instance['meetings']: - m_id = meeting['id'] - duration = meeting['duration'] - scheduled_slots = solution.get(m_id, None) - - # Check if the meeting is scheduled - if scheduled_slots is None: - return False, f'Meeting {m_id} is not scheduled.' - - # Check if the meeting fits within the number of total time slots - if any(slot >= instance['time_slots'] for slot in scheduled_slots): - return False, f'Meeting {m_id} does not fit within the available time slots.' 
- - # Check if the scheduled slots are contiguous and fit the meeting duration - if len(scheduled_slots) != duration or not all(scheduled_slots[i] + 1 == scheduled_slots[i + 1] - for i in range(len(scheduled_slots) - 1)): - return False, f'Meeting {m_id} is not scheduled in contiguous time slots fitting its duration.' - - # Check if all participants are available at the scheduled time - for p_id, participant in instance['participants'].items(): - if m_id in participant['meetings']: - if not all(slot in participant['available_slots'] for slot in scheduled_slots): - return False, f'Participant {p_id} is not available for meeting {m_id} at the scheduled time.' - - # Check if any participant is double-booked - participants_schedule = {p_id: [] for p_id in instance['participants']} - for m_id, time_slots in solution.items(): - try: - duration = next(meeting['duration'] for meeting in instance['meetings'] if meeting['id'] == m_id) - if len(time_slots) != duration: - return False, f'Meeting {m_id} duration does not match the number of scheduled time slots.' - for p_id, participant in instance['participants'].items(): - if m_id in participant['meetings']: - participants_schedule[p_id].extend(time_slots) - except Exception: - return False, f'Meeting {m_id} is not in the instance or program error.' - - for p_id, slots in participants_schedule.items(): - if len(slots) != len(set(slots)): - return False, f'Participant {p_id} is double-booked.' - - return True, 'The solution is valid.' - - def parse_xml_to_dict(self, xml_string): - try: - # Parse the XML string - root = ET.fromstring(xml_string) - - # Find the 'final_answer' tag - final_answer_element = root.find('final_answer') - - # Find the 'reasoning' tag - reasoning_element = root.find('reasoning') - except: - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end] - reasoning_element = xml_string[reasoning_start:reasoning_end] - except: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element +import ast +import json +import xml.etree.ElementTree as ET + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import mspPrompts + + +def q2text(q, p=mspPrompts): # q is the data for the HP-hard question, p is the prompt + total_participants = q['participants'] + total_timeslots = q['time_slots'] + prompt_text = p['Intro'] + '\n' \ + + p['Initial_question'].format(total_participants=total_participants,total_timeslots=total_timeslots) + '\n' \ + + p['Output_content'] + '\n' \ + + p['Output_format'] + \ + '\n The meetings and participants details are as below: \n' + meetings = q['meetings'] + participants = q['participants'] + for meeting in meetings: + this_line = 'Meeting {} is with duration {}.'.format(meeting['id'], meeting['duration']) + prompt_text += this_line + '\n' + for j in participants.keys(): + this_line = 'Participant {} is available at time slots {} and has meetings {}.'.format(j, participants[j]['available_slots'], participants[j]['meetings']) + prompt_text += this_line + '\n' + return 
prompt_text + + +@LOAD_DATASET.register_module(force=True) +class Hard_MSP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + with open(data_path + 'msp_instances.json', 'r') as f: + data = json.load(f) + all_data = zip([int(d['complexity_level']) for d in data], data) + + for (level, q) in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class Hard_MSP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + + output_dict['output'] = output + output_dict['level'] = level + try: + output_dict['correctness'], _ = self.mspCheck(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def mspCheck(self, instance, llm_string): + """Validate the MSP solution. + + Parameters: + - instance: The MSP instance as a dictionary. + - solution: A dictionary with meeting ids as keys and lists of scheduled time slots as values. + + Returns: + - A tuple (is_valid, message). is_valid is True if the solution is valid, False otherwise. + message contains information about the validity of the solution. + """ + # print(llm_string) + solution, reasoning_element = self.parse_xml_to_dict(llm_string) + # print(solution.text) + + # convert solution to dictionary + if solution == '': + return False, None + elif solution is None: + return False, None + else: + if isinstance(solution, str): + try: + solution = ast.literal_eval(solution) + if solution is None: + return False, None + except Exception: + try: + solution = ast.literal_eval('{' + solution + '}') + if solution is None: + return False, None + except Exception: + return False, None + else: + try: + solution = ast.literal_eval(solution.text) + if solution is None: + return False, None + except Exception: + return False, None + # convert key type to int + if isinstance(solution, dict): + print(solution) + solution = {int(k): v for k, v in solution.items()} + else: + return False, None + + # Check if all meetings are scheduled within the available time slots + for meeting in instance['meetings']: + m_id = meeting['id'] + duration = meeting['duration'] + scheduled_slots = solution.get(m_id, None) + + # Check if the meeting is scheduled + if scheduled_slots is None: + return False, f'Meeting {m_id} is not scheduled.' + + # Check if the meeting fits within the number of total time slots + if any(slot >= instance['time_slots'] for slot in scheduled_slots): + return False, f'Meeting {m_id} does not fit within the available time slots.' 
+ + # Check if the scheduled slots are contiguous and fit the meeting duration + if len(scheduled_slots) != duration or not all(scheduled_slots[i] + 1 == scheduled_slots[i + 1] + for i in range(len(scheduled_slots) - 1)): + return False, f'Meeting {m_id} is not scheduled in contiguous time slots fitting its duration.' + + # Check if all participants are available at the scheduled time + for p_id, participant in instance['participants'].items(): + if m_id in participant['meetings']: + if not all(slot in participant['available_slots'] for slot in scheduled_slots): + return False, f'Participant {p_id} is not available for meeting {m_id} at the scheduled time.' + + # Check if any participant is double-booked + participants_schedule = {p_id: [] for p_id in instance['participants']} + for m_id, time_slots in solution.items(): + try: + duration = next(meeting['duration'] for meeting in instance['meetings'] if meeting['id'] == m_id) + if len(time_slots) != duration: + return False, f'Meeting {m_id} duration does not match the number of scheduled time slots.' + for p_id, participant in instance['participants'].items(): + if m_id in participant['meetings']: + participants_schedule[p_id].extend(time_slots) + except Exception: + return False, f'Meeting {m_id} is not in the instance or program error.' + + for p_id, slots in participants_schedule.items(): + if len(slots) != len(set(slots)): + return False, f'Participant {p_id} is double-booked.' + + return True, 'The solution is valid.' + + def parse_xml_to_dict(self, xml_string): + try: + # Parse the XML string + root = ET.fromstring(xml_string) + + # Find the 'final_answer' tag + final_answer_element = root.find('final_answer') + + # Find the 'reasoning' tag + reasoning_element = root.find('reasoning') + except: + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end] + reasoning_element = xml_string[reasoning_start:reasoning_end] + except: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element diff --git a/opencompass/datasets/NPHardEval/hard_TSP.py b/opencompass/datasets/NPHardEval/hard_TSP.py index fd74a17a..b372b2b2 100644 --- a/opencompass/datasets/NPHardEval/hard_TSP.py +++ b/opencompass/datasets/NPHardEval/hard_TSP.py @@ -1,213 +1,213 @@ -import ast -import json -import xml.etree.ElementTree as ET - -import numpy as np -import pandas as pd -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import tspPrompts - - -def q2text(q, p=tspPrompts): # q is the data for the HP-hard question, p is the prompt - total_cities = q.shape[0] - prompt_text = p['Intro'] + '\n' \ - + p['Initial_question'].format(total_cities=total_cities) + '\n' \ - + p['Output_content'] + '\n' \ - + p['Output_format'] + \ - '\n The distances between cities are below: \n' - for i in range(q.shape[0]): - for j in range(q.shape[1]): - if i < j: # only use the upper triangle - this_line = 'The path between City {} and City {} is with distance {}.'.format(i, j, q.iloc[i, j]) - prompt_text += this_line + '\n' - return prompt_text - - 
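The `q2text` helper above walks only the upper triangle of the (symmetric) distance matrix, so each pair of cities is described exactly once in the prompt. A small sketch of that traversal with an illustrative 3x3 matrix (real instances are read from the CSV files):

```
# Sketch of the upper-triangle traversal q2text uses to describe a distance matrix.
# The 3x3 matrix below is illustrative; real instances are read from CSV files.
import pandas as pd

q = pd.DataFrame([[0, 12, 7],
                  [12, 0, 9],
                  [7, 9, 0]])

lines = []
for i in range(q.shape[0]):
    for j in range(q.shape[1]):
        if i < j:  # each unordered pair (i, j) is listed exactly once
            lines.append('The path between City {} and City {} is with distance {}.'
                         .format(i, j, q.iloc[i, j]))
print('\n'.join(lines))
# The path between City 0 and City 1 is with distance 12.
# The path between City 0 and City 2 is with distance 7.
# The path between City 1 and City 2 is with distance 9.
```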
-@LOAD_DATASET.register_module(force=True) -class Hard_TSP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - for level in range(10): - for file_num in range(10): - # read np array - df = pd.read_csv(data_path + 'synthesized_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1), - header=None, - index_col=False) - # transform df to - all_data.append((level + 1, df)) - for (level, q) in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q.to_json()), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class Hard_TSP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - q = pd.DataFrame(eval(q)) - - output_dict['output'] = output - try: - output_dict['correctness'], _ = self.tspCheck(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['level'] = level - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - # Parse the XML string - root = ET.fromstring(xml_string) - - # Find the 'final_answer' tag - final_answer_element = root.find('final_answer') - - # Find the 'reasoning' tag - reasoning_element = root.find('reasoning') - except: - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end] - reasoning_element = xml_string[reasoning_start:reasoning_end] - except: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def tspCheck(self, distance_matrix, llm_string): - """Check if the TSP solution is complete and if the distance matches - the greedy solution. 
- - :param tour_string: String representing the TSP tour in the format "0->1->2->...->N->0" - :param distance_matrix: 2D numpy array representing the distances between cities - :return: Boolean indicating whether the tour is complete and matches the greedy distance - """ - # convert distance_matrix to numpy array - distance_matrix = np.array(distance_matrix) - - # Convert the tour string to a list of integers - # print(llm_string) - final_answer_element, reasoning_element = self.parse_xml_to_dict(llm_string) - # convert solution to dictionary - if final_answer_element == '': - return False, '' - elif final_answer_element is None: - return False, '' - else: - if isinstance(final_answer_element, str): - try: - tour_string = ast.literal_eval(final_answer_element)['Path'] - if tour_string is None: - return False, '' - except Exception: - try: - tour_string = ast.literal_eval('{' + final_answer_element + '}')['Path'] - if tour_string is None: - return False, '' - except Exception: - return False, '' - else: - try: - tour_string = ast.literal_eval(final_answer_element.text)['Path'] - if tour_string is None: - return False, '' - except Exception: - return False, '' - try: - tour = list(map(int, tour_string.split('->'))) - except Exception: - return False, '' - # we could also prinpt `reasoning_element` to see the reasoning of the answer - # we could also print the final distance of the tour by `final_answer_element['Distance']` - - # Check if tour is a cycle - if tour[0] != tour[-1]: - return False, 'The tour must start and end at the same city.' - - # Check if all cities are visited - if len(tour) != len(distance_matrix) + 1: - return False, 'The tour does not visit all cities exactly once.' - - # Calculate the distance of the provided tour - tour_distance = sum(distance_matrix[tour[i]][tour[i + 1]] - for i in range(len(tour) - 1)) - - # Find the greedy tour distance for comparison - greedy_tour, greedy_distance = self.greedy_tsp(distance_matrix) - - # Check if the provided tour distance is equal to the greedy tour distance - if tour_distance != greedy_distance: - return False, f'The tour distance ({tour_distance}) does not match the greedy solution ({greedy_distance}).' - - return True, 'The solution is complete and matches the greedy solution distance.' - - def greedy_tsp(self, distance_matrix): - """Solve the Traveling Salesman Problem using a greedy algorithm. 
- - :param distance_matrix: 2D numpy array where the element at [i, j] is the distance between city i and j - :return: A tuple containing a list of the cities in the order they were visited and the total distance - """ - num_cities = distance_matrix.shape[0] - unvisited_cities = set(range(num_cities)) - current_city = np.random.choice(list(unvisited_cities)) - tour = [current_city] - total_distance = 0 - - while unvisited_cities: - unvisited_cities.remove(current_city) - if unvisited_cities: - # Find the nearest unvisited city - distances_to_unvisited = distance_matrix[current_city][list(unvisited_cities)] - nearest_city = list(unvisited_cities)[np.argmin(distances_to_unvisited)] - tour.append(nearest_city) - # Update the total distance - total_distance += distance_matrix[current_city, nearest_city] - current_city = nearest_city - - # Return to start - total_distance += distance_matrix[current_city, tour[0]] - tour.append(tour[0]) - - return tour, total_distance +import ast +import json +import xml.etree.ElementTree as ET + +import numpy as np +import pandas as pd +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import tspPrompts + + +def q2text(q, p=tspPrompts): # q is the data for the HP-hard question, p is the prompt + total_cities = q.shape[0] + prompt_text = p['Intro'] + '\n' \ + + p['Initial_question'].format(total_cities=total_cities) + '\n' \ + + p['Output_content'] + '\n' \ + + p['Output_format'] + \ + '\n The distances between cities are below: \n' + for i in range(q.shape[0]): + for j in range(q.shape[1]): + if i < j: # only use the upper triangle + this_line = 'The path between City {} and City {} is with distance {}.'.format(i, j, q.iloc[i, j]) + prompt_text += this_line + '\n' + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class Hard_TSP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + for level in range(10): + for file_num in range(10): + # read np array + df = pd.read_csv(data_path + 'synthesized_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1), + header=None, + index_col=False) + # transform df to + all_data.append((level + 1, df)) + for (level, q) in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q.to_json()), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class Hard_TSP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + q = pd.DataFrame(eval(q)) + + output_dict['output'] = output + try: + output_dict['correctness'], _ = self.tspCheck(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['level'] = level + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def 
parse_xml_to_dict(self, xml_string): + try: + # Parse the XML string + root = ET.fromstring(xml_string) + + # Find the 'final_answer' tag + final_answer_element = root.find('final_answer') + + # Find the 'reasoning' tag + reasoning_element = root.find('reasoning') + except: + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end] + reasoning_element = xml_string[reasoning_start:reasoning_end] + except: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def tspCheck(self, distance_matrix, llm_string): + """Check if the TSP solution is complete and if the distance matches + the greedy solution. + + :param tour_string: String representing the TSP tour in the format "0->1->2->...->N->0" + :param distance_matrix: 2D numpy array representing the distances between cities + :return: Boolean indicating whether the tour is complete and matches the greedy distance + """ + # convert distance_matrix to numpy array + distance_matrix = np.array(distance_matrix) + + # Convert the tour string to a list of integers + # print(llm_string) + final_answer_element, reasoning_element = self.parse_xml_to_dict(llm_string) + # convert solution to dictionary + if final_answer_element == '': + return False, '' + elif final_answer_element is None: + return False, '' + else: + if isinstance(final_answer_element, str): + try: + tour_string = ast.literal_eval(final_answer_element)['Path'] + if tour_string is None: + return False, '' + except Exception: + try: + tour_string = ast.literal_eval('{' + final_answer_element + '}')['Path'] + if tour_string is None: + return False, '' + except Exception: + return False, '' + else: + try: + tour_string = ast.literal_eval(final_answer_element.text)['Path'] + if tour_string is None: + return False, '' + except Exception: + return False, '' + try: + tour = list(map(int, tour_string.split('->'))) + except Exception: + return False, '' + # we could also prinpt `reasoning_element` to see the reasoning of the answer + # we could also print the final distance of the tour by `final_answer_element['Distance']` + + # Check if tour is a cycle + if tour[0] != tour[-1]: + return False, 'The tour must start and end at the same city.' + + # Check if all cities are visited + if len(tour) != len(distance_matrix) + 1: + return False, 'The tour does not visit all cities exactly once.' + + # Calculate the distance of the provided tour + tour_distance = sum(distance_matrix[tour[i]][tour[i + 1]] + for i in range(len(tour) - 1)) + + # Find the greedy tour distance for comparison + greedy_tour, greedy_distance = self.greedy_tsp(distance_matrix) + + # Check if the provided tour distance is equal to the greedy tour distance + if tour_distance != greedy_distance: + return False, f'The tour distance ({tour_distance}) does not match the greedy solution ({greedy_distance}).' + + return True, 'The solution is complete and matches the greedy solution distance.' + + def greedy_tsp(self, distance_matrix): + """Solve the Traveling Salesman Problem using a greedy algorithm. 
+ + :param distance_matrix: 2D numpy array where the element at [i, j] is the distance between city i and j + :return: A tuple containing a list of the cities in the order they were visited and the total distance + """ + num_cities = distance_matrix.shape[0] + unvisited_cities = set(range(num_cities)) + current_city = np.random.choice(list(unvisited_cities)) + tour = [current_city] + total_distance = 0 + + while unvisited_cities: + unvisited_cities.remove(current_city) + if unvisited_cities: + # Find the nearest unvisited city + distances_to_unvisited = distance_matrix[current_city][list(unvisited_cities)] + nearest_city = list(unvisited_cities)[np.argmin(distances_to_unvisited)] + tour.append(nearest_city) + # Update the total distance + total_distance += distance_matrix[current_city, nearest_city] + current_city = nearest_city + + # Return to start + total_distance += distance_matrix[current_city, tour[0]] + tour.append(tour[0]) + + return tour, total_distance diff --git a/opencompass/datasets/NPHardEval/p_BSP.py b/opencompass/datasets/NPHardEval/p_BSP.py index f59c6c5d..224cf865 100644 --- a/opencompass/datasets/NPHardEval/p_BSP.py +++ b/opencompass/datasets/NPHardEval/p_BSP.py @@ -1,126 +1,126 @@ -import ast -import json - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import bspPrompts - - -def q2text(q, p=bspPrompts): - target_value = q['target'] - # TO-DO: fix data not being sorted - array = sorted(q['array']) - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(target_value=target_value) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] + \ - '\n The sorted array elements are: ' + ', '.join(map(str, array)) + '\n' - - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class P_BSP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data, newdata = [], [] - with open(data_path + 'bsp_instances.json', 'r') as f: - data = json.load(f) - for sample in data: - level = len(sample['array']) - 2 - all_data.append((level, sample)) - - for level, q in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class P_BSP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - output, reasoning = self.parse_xml_to_dict(output) - output_dict['output'] = output - try: - output_dict['correctness'], _ = self.bsp_check(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['reasoning'] = reasoning - output_dict['level'] = level - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - assert 
'' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - except Exception: - final_answer_element = '' - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def bsp_check(self, instance, solution): - """Check if the binary search solution is valid. - - :param instance: The instance dictionary with array and target value. - :param solution: The solution dictionary with the position of the target value. - :return: A tuple of (is_correct, message). - """ - array = sorted(instance['array']) - target_value = instance['target'] - solution, reasoning = self.parse_xml_to_dict(solution) - if isinstance(solution, str): - return False, f'The solution is invalid.' - try: - position = int(solution['Position']) - except Exception: - return False, f'The solution is invalid.' - if position == -1 or position >= len(array): - return False, f'The solution is invalid.' - elif array[position] != target_value: - return False, f'The target index is incorrect.' - return True, 'The solution is valid.' +import ast +import json + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import bspPrompts + + +def q2text(q, p=bspPrompts): + target_value = q['target'] + # TO-DO: fix data not being sorted + array = sorted(q['array']) + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(target_value=target_value) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + \ + '\n The sorted array elements are: ' + ', '.join(map(str, array)) + '\n' + + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class P_BSP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data, newdata = [], [] + with open(data_path + 'bsp_instances.json', 'r') as f: + data = json.load(f) + for sample in data: + level = len(sample['array']) - 2 + all_data.append((level, sample)) + + for level, q in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class P_BSP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + output, reasoning = self.parse_xml_to_dict(output) + output_dict['output'] = output + try: + output_dict['correctness'], _ = self.bsp_check(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['reasoning'] = reasoning + output_dict['level'] = level + + if 
output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + except Exception: + final_answer_element = '' + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element + + def bsp_check(self, instance, solution): + """Check if the binary search solution is valid. + + :param instance: The instance dictionary with array and target value. + :param solution: The solution dictionary with the position of the target value. + :return: A tuple of (is_correct, message). + """ + array = sorted(instance['array']) + target_value = instance['target'] + solution, reasoning = self.parse_xml_to_dict(solution) + if isinstance(solution, str): + return False, f'The solution is invalid.' + try: + position = int(solution['Position']) + except Exception: + return False, f'The solution is invalid.' + if position == -1 or position >= len(array): + return False, f'The solution is invalid.' + elif array[position] != target_value: + return False, f'The target index is incorrect.' + return True, 'The solution is valid.' 
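The evaluators in this patch all share the same weighted scoring scheme: each reference string packs the difficulty level before a `####\n` separator, and each pass or fail is weighted by that level. The following minimal sketch is illustrative only; the `weighted_accuracy` helper and the sample instances are invented for this example and are not part of OpenCompass, but the reference format and the arithmetic mirror the `score()` methods above.

```python
import json

def weighted_accuracy(references, correctness_flags):
    """Weight each item by its difficulty level; return the pass rate in percent.

    `references` follow the '<level>####\n<json instance>' format produced by
    the dataset loaders above; `correctness_flags` holds one bool per prediction.
    """
    result = {'pass': 0, 'fail': 0}
    for ref, correct in zip(references, correctness_flags):
        level = int(ref.split('####\n')[0])
        result['pass' if correct else 'fail'] += level
    return result['pass'] / (result['pass'] + result['fail']) * 100

# Hypothetical data: three instances of increasing difficulty, two solved.
refs = [str(lvl) + '####\n' + json.dumps({'target': 7, 'array': [1, 3, 5, 7]})
        for lvl in (1, 2, 3)]
print(weighted_accuracy(refs, [True, False, True]))  # 4 / 6 * 100 = 66.66...
```

Because failures are weighted by level, a model that only solves the easiest instances ends up well below its unweighted accuracy.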
diff --git a/opencompass/datasets/NPHardEval/p_EDP.py b/opencompass/datasets/NPHardEval/p_EDP.py index f94116aa..747f7132 100644 --- a/opencompass/datasets/NPHardEval/p_EDP.py +++ b/opencompass/datasets/NPHardEval/p_EDP.py @@ -1,147 +1,147 @@ -import ast -import json - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import edpPrompts - - -def q2text(q, p=edpPrompts): - string_a = q['string_a'] - string_b = q['string_b'] - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(string_a=string_a, string_b=string_b) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class P_EDP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - with open(data_path + 'edp_instances.json', 'r') as f: - data = json.load(f) - for sample in data: - level = len(sample['string_a']) - 2 - all_data.append((level, sample)) - - for level, q in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class P_EDP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - output, reasoning = self.parse_xml_to_dict(output) - output_dict['output'] = output - try: - output_dict['correctness'], _ = self.edp_check(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['reasoning'] = reasoning - output_dict['level'] = level - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def compute_min_edit_distance(self, string_a, string_b): - """Computes the minimum edit distance between two strings using dynamic - programming.""" - m, n = len(string_a), len(string_b) - dp = [[0] * (n + 1) for _ in range(m + 1)] - - for i in range(m + 1): - for j in range(n + 1): - if i == 0: - dp[i][j] = j - elif j == 0: - dp[i][j] = i - elif string_a[i - 1] == string_b[j - 1]: - dp[i][j] = dp[i - 1][j - 1] - else: - dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) - return dp[m][n] - - def edp_check(self, instance, solution): - """Check if the edit distance solution is valid. - - :param instance: The instance dictionary with 'string_a' and 'string_b'. - :param solution: The solution dictionary with the reported 'edit_distance'. - :return: A tuple of (is_correct, message). - """ - string_a = instance['string_a'] - string_b = instance['string_b'] - try: - reported_distance = int(solution.get('Operations', -1)) - except Exception: - reported_distance = -1 - - actual_distance = self.compute_min_edit_distance(string_a, string_b) - - if reported_distance == -1: - return False, 'No solution provided.' 
- elif reported_distance != actual_distance: - return False, f'The reported edit distance ({reported_distance}) is incorrect. Actual distance: {actual_distance}.' - return True, 'The solution is valid.' - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - assert '' in xml_string - # assert '' in xml_string - # assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - # reasoning_start = xml_string.index('') + len('') - # reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - assert '{' in final_answer_element - assert '}' in final_answer_element - dic_start = final_answer_element.index('{') - dic_end = final_answer_element.index('}') - final_answer_element = final_answer_element[dic_start:dic_end + 1].rstrip().strip().rstrip() - reasoning_element = xml_string - # reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - except Exception: - final_answer_element = '' - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element +import ast +import json + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import edpPrompts + + +def q2text(q, p=edpPrompts): + string_a = q['string_a'] + string_b = q['string_b'] + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(string_a=string_a, string_b=string_b) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class P_EDP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + with open(data_path + 'edp_instances.json', 'r') as f: + data = json.load(f) + for sample in data: + level = len(sample['string_a']) - 2 + all_data.append((level, sample)) + + for level, q in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class P_EDP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + output, reasoning = self.parse_xml_to_dict(output) + output_dict['output'] = output + try: + output_dict['correctness'], _ = self.edp_check(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['reasoning'] = reasoning + output_dict['level'] = level + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def compute_min_edit_distance(self, string_a, string_b): + """Computes the minimum edit distance between two strings using dynamic + 
programming.""" + m, n = len(string_a), len(string_b) + dp = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(m + 1): + for j in range(n + 1): + if i == 0: + dp[i][j] = j + elif j == 0: + dp[i][j] = i + elif string_a[i - 1] == string_b[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + else: + dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + return dp[m][n] + + def edp_check(self, instance, solution): + """Check if the edit distance solution is valid. + + :param instance: The instance dictionary with 'string_a' and 'string_b'. + :param solution: The solution dictionary with the reported 'edit_distance'. + :return: A tuple of (is_correct, message). + """ + string_a = instance['string_a'] + string_b = instance['string_b'] + try: + reported_distance = int(solution.get('Operations', -1)) + except Exception: + reported_distance = -1 + + actual_distance = self.compute_min_edit_distance(string_a, string_b) + + if reported_distance == -1: + return False, 'No solution provided.' + elif reported_distance != actual_distance: + return False, f'The reported edit distance ({reported_distance}) is incorrect. Actual distance: {actual_distance}.' + return True, 'The solution is valid.' + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + # assert '' in xml_string + # assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + # reasoning_start = xml_string.index('') + len('') + # reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + assert '{' in final_answer_element + assert '}' in final_answer_element + dic_start = final_answer_element.index('{') + dic_end = final_answer_element.index('}') + final_answer_element = final_answer_element[dic_start:dic_end + 1].rstrip().strip().rstrip() + reasoning_element = xml_string + # reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + except Exception: + final_answer_element = '' + except Exception: + final_answer_element = '' + reasoning_element = '' + + return final_answer_element, reasoning_element diff --git a/opencompass/datasets/NPHardEval/p_SPP.py b/opencompass/datasets/NPHardEval/p_SPP.py index d9b073e9..f0345a3c 100644 --- a/opencompass/datasets/NPHardEval/p_SPP.py +++ b/opencompass/datasets/NPHardEval/p_SPP.py @@ -1,202 +1,202 @@ -import ast -import json - -try: - import networkx as nx -except ImportError: - nx = None - -from datasets import Dataset - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .prompts import sppPrompts - - -def q2text(q, p=sppPrompts): - # start_node = q['start_node'] - # end_node = q['end_node'] - # TO-DO: fix later - start_node = q['nodes'][0] - end_node = q['nodes'][-1] - edges = q['edges'] - prompt_text = p['Intro'] + '\n' + \ - p['Initial_question'].format(start_node=start_node, end_node=end_node) + '\n' + \ - p['Output_content'] + '\n' + \ - p['Output_format'] + \ - "\n The graph's edges and weights are as follows: \n" - for edge in edges: - this_line = f"Edge from {edge['from']} to {edge['to']} has a weight of {edge['weight']}." 
- prompt_text += this_line + '\n' - return prompt_text - - -@LOAD_DATASET.register_module(force=True) -class P_SPP_Dataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - raw_data = [] - data_path = path - all_data = [] - with open(data_path + 'spp_instances.json', 'r') as f: - data = json.load(f) - all_data = zip([int(d['complexity_level']) for d in data], data) - for level, q in all_data: - prompt = q2text(q) - raw_data.append({ - 'prompt': prompt, - 'q': str(level) + '####\n' + json.dumps(q), - 'level': level - }) - dataset = Dataset.from_list(raw_data) - return dataset - - -@ICL_EVALUATORS.register_module(force=True) -class P_SPP_Evaluator(BaseEvaluator): - - def score(self, predictions, references): - assert len(predictions) == len(references) - - result = {'pass': 0, 'fail': 0} - for index, (q, output) in enumerate(zip(references, predictions)): - output_dict = {} - level = int(q.split('####\n')[0]) - q = json.loads(q.split('####\n')[-1]) - output, reasoning = self.parse_xml_to_dict(output) - output_dict['output'] = output - try: - output_dict['correctness'], _ = self.spp_check(q, output) - except Exception as e: - print(f'Check failed: {e}') - output_dict['correctness'] = False - output_dict['reasoning'] = reasoning - output_dict['level'] = level - - if output_dict['correctness']: - r = 'pass' - else: - r = 'fail' - result[r] += level - - result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 - final_result = {'Weighted Accuracy': result['score']} - return final_result - - def parse_xml_to_dict(self, xml_string): - try: - assert '' in xml_string - assert '' in xml_string - # assert '' in xml_string - # assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - # reasoning_start = xml_string.index('') + len('') - # reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() - assert '{' in final_answer_element - assert '}' in final_answer_element - dic_start = final_answer_element.index('{') - dic_end = final_answer_element.index('}') - final_answer_element = final_answer_element[dic_start:dic_end + 1].rstrip().strip().rstrip() - # reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() - try: - final_answer_element = ast.literal_eval(final_answer_element) - reasoning_element = xml_string - except Exception: - final_answer_element = '' - reasoning_element = xml_string - except Exception: - final_answer_element = '' - reasoning_element = '' - - return final_answer_element, reasoning_element - - def ssp_optimal_solution(self, instance, source, target): - """Provides the optimal solution for the SSP instance. - - :param instance: The SSP instance as a dictionary with 'nodes' and 'edges'. - :param source: The source node. - :param target: The destination node. - :return: The optimal shortest path length and path. 
- """ - G = nx.Graph() - G.add_nodes_from(instance['nodes']) - G.add_weighted_edges_from([(edge['from'], edge['to'], edge['weight']) - for edge in instance['edges']]) - shortest_path_length = None - shortest_path = None - if nx.has_path(G, source=source, target=target): - shortest_path_length = nx.shortest_path_length(G, source=source, target=target, weight='weight') - shortest_path = nx.shortest_path(G, source=source, target=target, weight='weight') - return shortest_path_length, shortest_path - - # SPP - def spp_check(self, instance, solution, start_node=None, end_node=None): - """Validate the solution of the SPP problem. - - :param instance: The instance dictionary with nodes and edges. - :param solution: The solution dictionary with the path and total distance. - :param start_node: The start node. - :param end_node: The end node. - :return: A tuple of (is_correct, message). - """ - # Get the start and end nodes - # Currently, the start and end nodes are the first and last nodes in the instance - if start_node is None: - start_node = instance['nodes'][0] - if end_node is None: - end_node = instance['nodes'][-1] - - # Convert solution to dictionary - try: - path_string = solution.get('Path', '') - cost_string = solution.get('TotalDistance', '') - except Exception: - return False, 'The solution is not a dictionary.' - - # Calculate the optimal solution - ssp_optimal_length, ssp_optimal_path = self.ssp_optimal_solution( - instance, start_node, end_node) - if ssp_optimal_length is None: - if isinstance(cost_string, int) or cost_string.isdigit(): - return False, f'No path between from node {start_node} to node {end_node}.' - else: - return True, 'No path found from node {start_node} to node {end_node}.' - - try: - path = list(map(int, path_string.split('->'))) - total_cost = int(cost_string) - except Exception: - return False, 'The solution is not a valid dictionary.' - - # Check if path starts and ends with the correct nodes - if not path or path[0] != start_node or path[-1] != end_node: - return False, 'The path does not start or end at the correct nodes.' - - # Check if the path is continuous and calculate the cost - calculated_cost = 0 - is_in_edge = lambda edge, from_node, to_node: (edge['from'] == from_node and edge['to'] == to_node) or (edge['from'] == to_node and edge['to'] == from_node) - for i in range(len(path) - 1): - from_node, to_node = path[i], path[i + 1] - edge = next((edge for edge in instance['edges'] if is_in_edge(edge, from_node, to_node)), None) - - if not edge: - return False, f'No edge found from node {from_node} to node {to_node}.' - - calculated_cost += edge['weight'] - - # Check if the calculated cost matches the total cost provided in the solution - if calculated_cost != total_cost: - return False, f'The calculated cost ({calculated_cost}) does not match the provided total cost ({total_cost}).' - - if calculated_cost != ssp_optimal_length: - # spp_optimal_path = '->'.join(map(str, ssp_optimal_path)) - return False, f'The calculated cost ({calculated_cost}) does not match the optimal solution ({ssp_optimal_length}): {ssp_optimal_path}.' - - return True, 'The solution is valid.' 
+import ast +import json + +try: + import networkx as nx +except ImportError: + nx = None + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .prompts import sppPrompts + + +def q2text(q, p=sppPrompts): + # start_node = q['start_node'] + # end_node = q['end_node'] + # TO-DO: fix later + start_node = q['nodes'][0] + end_node = q['nodes'][-1] + edges = q['edges'] + prompt_text = p['Intro'] + '\n' + \ + p['Initial_question'].format(start_node=start_node, end_node=end_node) + '\n' + \ + p['Output_content'] + '\n' + \ + p['Output_format'] + \ + "\n The graph's edges and weights are as follows: \n" + for edge in edges: + this_line = f"Edge from {edge['from']} to {edge['to']} has a weight of {edge['weight']}." + prompt_text += this_line + '\n' + return prompt_text + + +@LOAD_DATASET.register_module(force=True) +class P_SPP_Dataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + raw_data = [] + data_path = path + all_data = [] + with open(data_path + 'spp_instances.json', 'r') as f: + data = json.load(f) + all_data = zip([int(d['complexity_level']) for d in data], data) + for level, q in all_data: + prompt = q2text(q) + raw_data.append({ + 'prompt': prompt, + 'q': str(level) + '####\n' + json.dumps(q), + 'level': level + }) + dataset = Dataset.from_list(raw_data) + return dataset + + +@ICL_EVALUATORS.register_module(force=True) +class P_SPP_Evaluator(BaseEvaluator): + + def score(self, predictions, references): + assert len(predictions) == len(references) + + result = {'pass': 0, 'fail': 0} + for index, (q, output) in enumerate(zip(references, predictions)): + output_dict = {} + level = int(q.split('####\n')[0]) + q = json.loads(q.split('####\n')[-1]) + output, reasoning = self.parse_xml_to_dict(output) + output_dict['output'] = output + try: + output_dict['correctness'], _ = self.spp_check(q, output) + except Exception as e: + print(f'Check failed: {e}') + output_dict['correctness'] = False + output_dict['reasoning'] = reasoning + output_dict['level'] = level + + if output_dict['correctness']: + r = 'pass' + else: + r = 'fail' + result[r] += level + + result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100 + final_result = {'Weighted Accuracy': result['score']} + return final_result + + def parse_xml_to_dict(self, xml_string): + try: + assert '' in xml_string + assert '' in xml_string + # assert '' in xml_string + # assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + # reasoning_start = xml_string.index('') + len('') + # reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip() + assert '{' in final_answer_element + assert '}' in final_answer_element + dic_start = final_answer_element.index('{') + dic_end = final_answer_element.index('}') + final_answer_element = final_answer_element[dic_start:dic_end + 1].rstrip().strip().rstrip() + # reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip() + try: + final_answer_element = ast.literal_eval(final_answer_element) + reasoning_element = xml_string + except Exception: + final_answer_element = '' + reasoning_element = xml_string + except Exception: + final_answer_element = '' + reasoning_element = '' + + return 
final_answer_element, reasoning_element + + def ssp_optimal_solution(self, instance, source, target): + """Provides the optimal solution for the SSP instance. + + :param instance: The SSP instance as a dictionary with 'nodes' and 'edges'. + :param source: The source node. + :param target: The destination node. + :return: The optimal shortest path length and path. + """ + G = nx.Graph() + G.add_nodes_from(instance['nodes']) + G.add_weighted_edges_from([(edge['from'], edge['to'], edge['weight']) + for edge in instance['edges']]) + shortest_path_length = None + shortest_path = None + if nx.has_path(G, source=source, target=target): + shortest_path_length = nx.shortest_path_length(G, source=source, target=target, weight='weight') + shortest_path = nx.shortest_path(G, source=source, target=target, weight='weight') + return shortest_path_length, shortest_path + + # SPP + def spp_check(self, instance, solution, start_node=None, end_node=None): + """Validate the solution of the SPP problem. + + :param instance: The instance dictionary with nodes and edges. + :param solution: The solution dictionary with the path and total distance. + :param start_node: The start node. + :param end_node: The end node. + :return: A tuple of (is_correct, message). + """ + # Get the start and end nodes + # Currently, the start and end nodes are the first and last nodes in the instance + if start_node is None: + start_node = instance['nodes'][0] + if end_node is None: + end_node = instance['nodes'][-1] + + # Convert solution to dictionary + try: + path_string = solution.get('Path', '') + cost_string = solution.get('TotalDistance', '') + except Exception: + return False, 'The solution is not a dictionary.' + + # Calculate the optimal solution + ssp_optimal_length, ssp_optimal_path = self.ssp_optimal_solution( + instance, start_node, end_node) + if ssp_optimal_length is None: + if isinstance(cost_string, int) or cost_string.isdigit(): + return False, f'No path between from node {start_node} to node {end_node}.' + else: + return True, 'No path found from node {start_node} to node {end_node}.' + + try: + path = list(map(int, path_string.split('->'))) + total_cost = int(cost_string) + except Exception: + return False, 'The solution is not a valid dictionary.' + + # Check if path starts and ends with the correct nodes + if not path or path[0] != start_node or path[-1] != end_node: + return False, 'The path does not start or end at the correct nodes.' + + # Check if the path is continuous and calculate the cost + calculated_cost = 0 + is_in_edge = lambda edge, from_node, to_node: (edge['from'] == from_node and edge['to'] == to_node) or (edge['from'] == to_node and edge['to'] == from_node) + for i in range(len(path) - 1): + from_node, to_node = path[i], path[i + 1] + edge = next((edge for edge in instance['edges'] if is_in_edge(edge, from_node, to_node)), None) + + if not edge: + return False, f'No edge found from node {from_node} to node {to_node}.' + + calculated_cost += edge['weight'] + + # Check if the calculated cost matches the total cost provided in the solution + if calculated_cost != total_cost: + return False, f'The calculated cost ({calculated_cost}) does not match the provided total cost ({total_cost}).' + + if calculated_cost != ssp_optimal_length: + # spp_optimal_path = '->'.join(map(str, ssp_optimal_path)) + return False, f'The calculated cost ({calculated_cost}) does not match the optimal solution ({ssp_optimal_length}): {ssp_optimal_path}.' + + return True, 'The solution is valid.' 
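`P_SPP_Evaluator.spp_check` accepts a model answer only if its path is connected edge by edge in the instance and its reported total distance equals the optimum returned by `ssp_optimal_solution`, which delegates to networkx. The sketch below is illustrative only; the instance data is invented, though the dict layout follows the `spp_instances.json` records consumed by the loader above, and the networkx calls are the same ones used in the evaluator.

```python
import networkx as nx

# A made-up SPP instance in the same shape as the records loaded above:
# 'nodes' is a list of node ids, 'edges' a list of {'from', 'to', 'weight'} dicts.
instance = {
    'nodes': [0, 1, 2, 3],
    'edges': [
        {'from': 0, 'to': 1, 'weight': 4},
        {'from': 1, 'to': 3, 'weight': 1},
        {'from': 0, 'to': 2, 'weight': 1},
        {'from': 2, 'to': 3, 'weight': 2},
    ],
}

G = nx.Graph()
G.add_nodes_from(instance['nodes'])
G.add_weighted_edges_from([(e['from'], e['to'], e['weight']) for e in instance['edges']])

source, target = instance['nodes'][0], instance['nodes'][-1]
if nx.has_path(G, source=source, target=target):
    # Dijkstra under the hood; weight='weight' matches the evaluator's calls.
    print(nx.shortest_path(G, source=source, target=target, weight='weight'))         # [0, 2, 3]
    print(nx.shortest_path_length(G, source=source, target=target, weight='weight'))  # 3
```

If no path exists between the chosen endpoints, `ssp_optimal_length` stays `None` and the checker instead verifies that the model did not report a numeric distance.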
diff --git a/opencompass/datasets/NPHardEval/prompts.py b/opencompass/datasets/NPHardEval/prompts.py index 91383d99..e24b9409 100644 --- a/opencompass/datasets/NPHardEval/prompts.py +++ b/opencompass/datasets/NPHardEval/prompts.py @@ -1,96 +1,96 @@ -# Overall fewshot prompts -FEW_SHOT_SELF = 'Please refer to a few examples of this problem and the corresponding reasoning process. The examples are:' -FEW_SHOT_OTHERS = 'Please refer to a few examples of another problem and the corresponding reasoning process. The problem is {initial_question}. {output_content}. The examples are:' - -# P problems -sppPrompts = { - 'Intro': 'The Shortest Path Problem (SPP) involves finding the shortest path between two nodes in a weighted graph.', - 'Initial_question': "You need to find the shortest path between node {start_node} and node {end_node} in a graph. The graph's edges and their weights are given.", - 'Output_content': 'Please provide the shortest path from {start_node} to {end_node} and its total distance. Offer a concise step-by-step explanation of your reasoning process. Aim for brevity and clarity in your response.', - 'Output_format': "Your output should be enclosed within tags. Include your reasoning in tags and the final path and total distance in tags, like {'Path': 'START->...->END', 'TotalDistance': 'INT_TOTAL_DISTANCE'}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -mfpPrompts = { - 'Intro': 'The Maximum Flow Problem (MFP) seeks to find the maximum possible flow from a source node to a sink node in a flow network, subject to capacity constraints on the edges.', - 'Initial_question': 'Determine the maximum flow from the source node {source_node} to the sink node {sink_node} in the given flow network. The capacities of the edges are provided.', - 'Output_content': 'Please indicate the maximum flow value and the flow for each edge. Provide a brief explanation of your methodology. Keep your response concise and focused.', - 'Output_format': "Enclose your output within tags. Present your reasoning in tags and the final maximum flow and edge flows in tags, like {'MaxFlow': 'MAX_FLOW_VALUE', 'Flows': {'NODE_1->NODE_2': FLOW, ...}}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -bspPrompts = { - 'Intro': 'The Binary Search Problem (BSP) deals with finding the position of a target value within a sorted array using a binary search algorithm, which efficiently narrows down the search range.', - 'Initial_question': 'Find the position of the target value {target_value} in the sorted array. The index begins with 0. The array elements are provided.', - 'Output_content': 'Please identify the position of the target value in the array. Offer a brief, step-by-step account of your search process. Aim for conciseness in your response.', - 'Output_format': "Your output should be enclosed in tags. Include your search process in tags and the final position of the target value in tags, like {'Position': 'TARGET_POSITION'}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -edpPrompts = { - 'Intro': 'The Edit Distance Problem (EDP) involves finding the minimum number of operations required to transform one string into another, where each operation is either an insertion, deletion, or substitution of a single character.', - 'Initial_question': 'Find the minimum number of operations required to transform the first string {string_a} into the second string {string_b}. 
The operations are insertion, deletion, and substitution of a single character, each requiring 1 edit operation.', - 'Output_content': 'Please provide the minimum number of operations required to transform the first string into the second string. Offer a brief explanation of your methodology. Keep your response concise and focused.', - 'Output_format': "Enclose your output within tags. Present your reasoning in tags and the final minimum number of operations in tags, like {'Operations': 'MINIMUM_NUMBER_OF_OPERATIONS'}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -# NP-complete problems -tsp_dPrompts = { - 'Intro': 'The Traveling Salesman Problem (Decision Version, TSP-D) focuses on determining if a salesman can complete a route, visiting each city at least once, with the total travel distance being less than a specified value.', - 'Initial_question': "Check if it's possible for a salesman to visit each of the {total_cities} cities at least once and return to the starting city with the total distance less than {distance_limit}. The distances between each pair of cities are given.", - 'Output_content': 'Provide a yes or no answer, with a succinct explanation of your decision process. Focus on clarity and brevity in your response.', - 'Output_format': "Enclose your output in tags. Present your reasoning in tags and the final yes/no answer in tags, like {'Feasible': 'YES_OR_NO'}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -gcp_dPrompts = { - 'Intro': 'The Graph Coloring Problem (Decision Version, GCP-D) involves determining if it is possible to color the vertices of a graph using a given number of colors, ensuring no two adjacent vertices have the same color.', - 'Initial_question': 'Find out if the vertices of a graph with {total_vertices} vertices can be colored using only {number_of_colors} colors, such that no adjacent vertices share the same color.', - 'Output_content': 'Provide a yes or no answer, along with a concise explanation of your reasoning. Keep your explanation focused and brief.', - 'Output_format': "Enclose your output in tags. Include your reasoning in tags and the final yes/no answer in tags, like {'Feasible': 'YES_OR_NO'}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -kspPrompts = { - 'Intro': 'The 0-1 Knapsack Problem (KSP) asks whether a subset of items, each with a given weight and value, can be chosen to fit into a knapsack of fixed capacity, maximizing the total value without exceeding the capacity.', - 'Initial_question': 'Determine if a subset of items can be selected to fit into a knapsack with a capacity of {knapsack_capacity}, maximizing value without exceeding the capacity. Item weights and values are provided.', - 'Output_content': 'Indicate if an optimal subset exists and its total value. Offer a concise explanation of your selection process. Aim for clarity and brevity in your response.', - 'Output_format': "Your output should be enclosed within tags. 
Include your selection process in tags and the final decision and total value in tags, like {'Feasible': 'YES_OR_NO', 'TotalValue': 'TOTAL_VALUE', 'SelectedItemIds': [0, 1]}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -# NP-hard problems -tspPrompts = { - 'Intro': 'The traveling salesman problem (TSP) is a classic optimization problem that aims to find the shortest possible route that visits a set of cities, with each city being visited exactly once and the route returning to the original city.', - 'Initial_question': 'You must find the shortest path that visits all {total_cities} cities, labelled from 1 to {total_cities}. The distances between each pair of cities are provided.', - 'Output_content': 'Please list each city in the order they are visited. Provide the total distance of the trip. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.', - 'Output_format': "Your output should contain two parts enclosed by . First, your step by step reasoning like The reasoning process. Second, the final output of the result path and total distance wrapped by final_answer tag, like {'Path': '0->1->2->...->N->0', 'TotalDistance': 'INT_TOTAL_DISTANCE'}", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -gcpPrompts = { - 'Intro': 'Graph coloring refers to the problem of coloring vertices of a graph in such a way that no two adjacent vertices have the same color. ', - 'Initial_question': 'There are {max_vertices} vertices 1 to {max_vertices} in a graph. You may use {max_colors} colors with alphabats from A, B, C,... to color the graph.', - 'Output_content': "Please label every vertex, even if it is disconnected from the rest of the graph. Please provide each vertex's color. Do not skip any vertices. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.", - 'Output_format': "Your output should contain two parts enclosed by . First, your step by step reasoning wrapped by . Second, the final output of all vertex numbers and their associated colors, wrapped by final_answer tag, like {0:'COLOR_1', 1:'COLOR_2', ...}.", - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} - -mspPrompts = { - 'Intro': 'The meeting scheduling problem (MSP) is a type of constraint satisfaction problem where the goal is to find a suitable time slot for a meeting that all participants can attend without conflicts in their schedules.', - 'Initial_question': "There are {total_participants} participants with their available time slots. There are {total_timeslots} consecutive non-overlapping time slots. Let's assume all meetings has duration of 1.", - 'Output_content': 'Please provide a time slot where all participants can attend the meeting. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.', - 'Output_format': 'Your output should contain two parts enclosed by . First, your step by step reasoning wrapped by . Second, the final output of meeting numbers followed by a list of slots, like {0:[1,2], 1:[4], ...}.', - 'Few_shot_self': FEW_SHOT_SELF, - 'Few_shot_others': FEW_SHOT_OTHERS -} +# Overall fewshot prompts +FEW_SHOT_SELF = 'Please refer to a few examples of this problem and the corresponding reasoning process. 
The examples are:' +FEW_SHOT_OTHERS = 'Please refer to a few examples of another problem and the corresponding reasoning process. The problem is {initial_question}. {output_content}. The examples are:' + +# P problems +sppPrompts = { + 'Intro': 'The Shortest Path Problem (SPP) involves finding the shortest path between two nodes in a weighted graph.', + 'Initial_question': "You need to find the shortest path between node {start_node} and node {end_node} in a graph. The graph's edges and their weights are given.", + 'Output_content': 'Please provide the shortest path from {start_node} to {end_node} and its total distance. Offer a concise step-by-step explanation of your reasoning process. Aim for brevity and clarity in your response.', + 'Output_format': "Your output should be enclosed within tags. Include your reasoning in tags and the final path and total distance in tags, like {'Path': 'START->...->END', 'TotalDistance': 'INT_TOTAL_DISTANCE'}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +mfpPrompts = { + 'Intro': 'The Maximum Flow Problem (MFP) seeks to find the maximum possible flow from a source node to a sink node in a flow network, subject to capacity constraints on the edges.', + 'Initial_question': 'Determine the maximum flow from the source node {source_node} to the sink node {sink_node} in the given flow network. The capacities of the edges are provided.', + 'Output_content': 'Please indicate the maximum flow value and the flow for each edge. Provide a brief explanation of your methodology. Keep your response concise and focused.', + 'Output_format': "Enclose your output within tags. Present your reasoning in tags and the final maximum flow and edge flows in tags, like {'MaxFlow': 'MAX_FLOW_VALUE', 'Flows': {'NODE_1->NODE_2': FLOW, ...}}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +bspPrompts = { + 'Intro': 'The Binary Search Problem (BSP) deals with finding the position of a target value within a sorted array using a binary search algorithm, which efficiently narrows down the search range.', + 'Initial_question': 'Find the position of the target value {target_value} in the sorted array. The index begins with 0. The array elements are provided.', + 'Output_content': 'Please identify the position of the target value in the array. Offer a brief, step-by-step account of your search process. Aim for conciseness in your response.', + 'Output_format': "Your output should be enclosed in tags. Include your search process in tags and the final position of the target value in tags, like {'Position': 'TARGET_POSITION'}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +edpPrompts = { + 'Intro': 'The Edit Distance Problem (EDP) involves finding the minimum number of operations required to transform one string into another, where each operation is either an insertion, deletion, or substitution of a single character.', + 'Initial_question': 'Find the minimum number of operations required to transform the first string {string_a} into the second string {string_b}. The operations are insertion, deletion, and substitution of a single character, each requiring 1 edit operation.', + 'Output_content': 'Please provide the minimum number of operations required to transform the first string into the second string. Offer a brief explanation of your methodology. Keep your response concise and focused.', + 'Output_format': "Enclose your output within tags. 
Present your reasoning in tags and the final minimum number of operations in tags, like {'Operations': 'MINIMUM_NUMBER_OF_OPERATIONS'}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +# NP-complete problems +tsp_dPrompts = { + 'Intro': 'The Traveling Salesman Problem (Decision Version, TSP-D) focuses on determining if a salesman can complete a route, visiting each city at least once, with the total travel distance being less than a specified value.', + 'Initial_question': "Check if it's possible for a salesman to visit each of the {total_cities} cities at least once and return to the starting city with the total distance less than {distance_limit}. The distances between each pair of cities are given.", + 'Output_content': 'Provide a yes or no answer, with a succinct explanation of your decision process. Focus on clarity and brevity in your response.', + 'Output_format': "Enclose your output in tags. Present your reasoning in tags and the final yes/no answer in tags, like {'Feasible': 'YES_OR_NO'}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +gcp_dPrompts = { + 'Intro': 'The Graph Coloring Problem (Decision Version, GCP-D) involves determining if it is possible to color the vertices of a graph using a given number of colors, ensuring no two adjacent vertices have the same color.', + 'Initial_question': 'Find out if the vertices of a graph with {total_vertices} vertices can be colored using only {number_of_colors} colors, such that no adjacent vertices share the same color.', + 'Output_content': 'Provide a yes or no answer, along with a concise explanation of your reasoning. Keep your explanation focused and brief.', + 'Output_format': "Enclose your output in tags. Include your reasoning in tags and the final yes/no answer in tags, like {'Feasible': 'YES_OR_NO'}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +kspPrompts = { + 'Intro': 'The 0-1 Knapsack Problem (KSP) asks whether a subset of items, each with a given weight and value, can be chosen to fit into a knapsack of fixed capacity, maximizing the total value without exceeding the capacity.', + 'Initial_question': 'Determine if a subset of items can be selected to fit into a knapsack with a capacity of {knapsack_capacity}, maximizing value without exceeding the capacity. Item weights and values are provided.', + 'Output_content': 'Indicate if an optimal subset exists and its total value. Offer a concise explanation of your selection process. Aim for clarity and brevity in your response.', + 'Output_format': "Your output should be enclosed within tags. Include your selection process in tags and the final decision and total value in tags, like {'Feasible': 'YES_OR_NO', 'TotalValue': 'TOTAL_VALUE', 'SelectedItemIds': [0, 1]}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +# NP-hard problems +tspPrompts = { + 'Intro': 'The traveling salesman problem (TSP) is a classic optimization problem that aims to find the shortest possible route that visits a set of cities, with each city being visited exactly once and the route returning to the original city.', + 'Initial_question': 'You must find the shortest path that visits all {total_cities} cities, labelled from 1 to {total_cities}. The distances between each pair of cities are provided.', + 'Output_content': 'Please list each city in the order they are visited. Provide the total distance of the trip. You should also provide very short step by step reasoning. 
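
The prompt dictionaries above are assembled field by field into a single query, and the reply is expected to come back wrapped in tags; the Output_format strings refer to wrapping tags whose angle-bracket names do not survive in this patch text, but <root>, <reasoning> and <final_answer> can be inferred from root.find('final_answer') in the NPHardEval/utils.py hunk further down. A minimal round-trip sketch follows; the join order, the assumption that kspPrompts is in scope, and the mock reply are all illustrative and not taken from this diff:

import ast
import xml.etree.ElementTree as ET


def build_query(prompts: dict, **fields) -> str:
    # Only 'Initial_question' carries format placeholders such as
    # {knapsack_capacity}; the remaining fields are used verbatim.
    return '\n'.join([
        prompts['Intro'],
        prompts['Initial_question'].format(**fields),
        prompts['Output_content'],
        prompts['Output_format'],
    ])


query = build_query(kspPrompts, knapsack_capacity=20)  # kspPrompts as defined above

# A mock reply in the shape the prompts request (tag names assumed to be
# <root>/<reasoning>/<final_answer>, matching the parser shown further down).
reply = ('<root><reasoning>Items 0 and 1 fit and maximise value.</reasoning>'
         "<final_answer>{'Feasible': 'YES', 'TotalValue': '19', "
         "'SelectedItemIds': [0, 1]}</final_answer></root>")
root = ET.fromstring(reply)
final_answer = ast.literal_eval(root.find('final_answer').text)
print(final_answer['Feasible'], final_answer['SelectedItemIds'])  # YES [0, 1]
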
Do not use multiple lines and try your best to save output tokens.', + 'Output_format': "Your output should contain two parts enclosed by . First, your step by step reasoning like The reasoning process. Second, the final output of the result path and total distance wrapped by final_answer tag, like {'Path': '0->1->2->...->N->0', 'TotalDistance': 'INT_TOTAL_DISTANCE'}", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +gcpPrompts = { + 'Intro': 'Graph coloring refers to the problem of coloring vertices of a graph in such a way that no two adjacent vertices have the same color. ', + 'Initial_question': 'There are {max_vertices} vertices 1 to {max_vertices} in a graph. You may use {max_colors} colors with alphabats from A, B, C,... to color the graph.', + 'Output_content': "Please label every vertex, even if it is disconnected from the rest of the graph. Please provide each vertex's color. Do not skip any vertices. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.", + 'Output_format': "Your output should contain two parts enclosed by . First, your step by step reasoning wrapped by . Second, the final output of all vertex numbers and their associated colors, wrapped by final_answer tag, like {0:'COLOR_1', 1:'COLOR_2', ...}.", + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} + +mspPrompts = { + 'Intro': 'The meeting scheduling problem (MSP) is a type of constraint satisfaction problem where the goal is to find a suitable time slot for a meeting that all participants can attend without conflicts in their schedules.', + 'Initial_question': "There are {total_participants} participants with their available time slots. There are {total_timeslots} consecutive non-overlapping time slots. Let's assume all meetings has duration of 1.", + 'Output_content': 'Please provide a time slot where all participants can attend the meeting. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.', + 'Output_format': 'Your output should contain two parts enclosed by . First, your step by step reasoning wrapped by . 
Second, the final output of meeting numbers followed by a list of slots, like {0:[1,2], 1:[4], ...}.', + 'Few_shot_self': FEW_SHOT_SELF, + 'Few_shot_others': FEW_SHOT_OTHERS +} diff --git a/opencompass/datasets/NPHardEval/utils.py b/opencompass/datasets/NPHardEval/utils.py index 1906fe2a..d5c48565 100644 --- a/opencompass/datasets/NPHardEval/utils.py +++ b/opencompass/datasets/NPHardEval/utils.py @@ -1,43 +1,43 @@ -import ast -import xml.etree.ElementTree as ET - - -def append_root_tags(string): - if not string.strip().startswith(''): - string = '\n' + string - if not string.strip().endswith(''): - string += '\n' - return string - - -def parse_xml_to_dict(xml_string): - final_answer_element = '' - reasoning_element = '' - - try: - # Parse the XML string - root = ET.fromstring(xml_string) - - # Find the 'final_answer' tag - final_answer_element = root.find('final_answer').text - - # Find the 'reasoning' tag - reasoning_element = root.find('reasoning').text - except Exception: - try: - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - assert '' in xml_string - final_answer_start = xml_string.index('') + len('') - final_answer_end = xml_string.index('') - reasoning_start = xml_string.index('') + len('') - reasoning_end = xml_string.index('') - final_answer_element = xml_string[final_answer_start:final_answer_end] - reasoning_element = xml_string[reasoning_start:reasoning_end] - except Exception: - final_answer_element = '' - reasoning_element = '' - - final_answer_element = ast.literal_eval(final_answer_element.strip()) - return final_answer_element, reasoning_element +import ast +import xml.etree.ElementTree as ET + + +def append_root_tags(string): + if not string.strip().startswith(''): + string = '\n' + string + if not string.strip().endswith(''): + string += '\n' + return string + + +def parse_xml_to_dict(xml_string): + final_answer_element = '' + reasoning_element = '' + + try: + # Parse the XML string + root = ET.fromstring(xml_string) + + # Find the 'final_answer' tag + final_answer_element = root.find('final_answer').text + + # Find the 'reasoning' tag + reasoning_element = root.find('reasoning').text + except Exception: + try: + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + assert '' in xml_string + final_answer_start = xml_string.index('') + len('') + final_answer_end = xml_string.index('') + reasoning_start = xml_string.index('') + len('') + reasoning_end = xml_string.index('') + final_answer_element = xml_string[final_answer_start:final_answer_end] + reasoning_element = xml_string[reasoning_start:reasoning_end] + except Exception: + final_answer_element = '' + reasoning_element = '' + + final_answer_element = ast.literal_eval(final_answer_element.strip()) + return final_answer_element, reasoning_element diff --git a/opencompass/datasets/TheoremQA/__init__.py b/opencompass/datasets/TheoremQA/__init__.py index a24a0ea1..620b9255 100644 --- a/opencompass/datasets/TheoremQA/__init__.py +++ b/opencompass/datasets/TheoremQA/__init__.py @@ -1,4 +1,4 @@ -from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2, - TheoremQADataset) -from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3, - TheoremQAEvaluatorV3, TheoremQA_postprocess_v4) +from .legacy import (TheoremQA_postprocess, TheoremQA_postprocess_v2, + TheoremQADataset) +from .main import (TheoremQA_postprocess_v3, TheoremQADatasetV3, + TheoremQAEvaluatorV3, TheoremQA_postprocess_v4) diff --git a/opencompass/datasets/TheoremQA/legacy.py 
b/opencompass/datasets/TheoremQA/legacy.py index 5a4b2563..afea4e02 100644 --- a/opencompass/datasets/TheoremQA/legacy.py +++ b/opencompass/datasets/TheoremQA/legacy.py @@ -1,40 +1,40 @@ -import re - -from datasets import load_dataset - -from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS -from opencompass.utils import get_data_path - -from ..base import BaseDataset - - -@LOAD_DATASET.register_module() -class TheoremQADataset(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - return load_dataset('csv', data_files={'test': path}) - - -@TEXT_POSTPROCESSORS.register_module('TheoremQA') -def TheoremQA_postprocess(text: str) -> str: - text = text.strip() - matches = re.findall(r'answer is ([^\s]+)', text) - if len(matches) == 0: - return text - else: - text = matches[0].strip().strip('.,?!\"\';:') - return text - - -def TheoremQA_postprocess_v2(text: str) -> str: - prediction = text.strip().strip('\n').split('\n')[-1] - tmp = '' - for entry in prediction.split(' ')[::-1]: - if entry == 'is' or entry == 'be' or entry == 'are' or entry.endswith( - ':'): - break - tmp = entry + ' ' + tmp - prediction = tmp.strip().strip('.') - return prediction +import re + +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +from ..base import BaseDataset + + +@LOAD_DATASET.register_module() +class TheoremQADataset(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + return load_dataset('csv', data_files={'test': path}) + + +@TEXT_POSTPROCESSORS.register_module('TheoremQA') +def TheoremQA_postprocess(text: str) -> str: + text = text.strip() + matches = re.findall(r'answer is ([^\s]+)', text) + if len(matches) == 0: + return text + else: + text = matches[0].strip().strip('.,?!\"\';:') + return text + + +def TheoremQA_postprocess_v2(text: str) -> str: + prediction = text.strip().strip('\n').split('\n')[-1] + tmp = '' + for entry in prediction.split(' ')[::-1]: + if entry == 'is' or entry == 'be' or entry == 'are' or entry.endswith( + ':'): + break + tmp = entry + ' ' + tmp + prediction = tmp.strip().strip('.') + return prediction diff --git a/opencompass/datasets/TheoremQA/main.py b/opencompass/datasets/TheoremQA/main.py index 4c145896..23251cb3 100644 --- a/opencompass/datasets/TheoremQA/main.py +++ b/opencompass/datasets/TheoremQA/main.py @@ -1,75 +1,75 @@ -import re -import json - -from datasets import Dataset, DatasetDict - -from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS -from opencompass.utils import get_data_path - -from opencompass.openicl.icl_evaluator import BaseEvaluator -from ..base import BaseDataset -from . 
import utils -from tqdm import tqdm - - -@LOAD_DATASET.register_module() -class TheoremQADatasetV3(BaseDataset): - - @staticmethod - def load(path: str): - path = get_data_path(path, local_mode=True) - with open(path, 'r') as f: - data = json.load(f) - for item in data: - item['Answer'] = str(item['Answer']) - dataset = Dataset.from_list(data) - return dataset - - -def TheoremQA_postprocess_v3(text: str) -> str: - answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text) - return answer - -def TheoremQA_postprocess_v4(text: str) -> str: - # First clean the answer text - answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text) - # Remove LaTeX delimiters \( and \) and strip whitespace - answer = answer.strip('\\(').strip('\\)').strip() - return answer - - -@ICL_EVALUATORS.register_module() -class TheoremQAEvaluatorV3(BaseEvaluator): - def score(self, predictions, references, test_set): - if len(predictions) != len(references): - return {"error": "preds and refrs have different length"} - - details = [] - correct, wrong = 0, 0 - for index in tqdm(range(len(predictions))): - answer = predictions[index] - groundtruth = references[index] - answer_type = test_set[index]['Answer_type'] - if answer_type in ['float', 'integer', 'bool']: - groundtruth = [groundtruth, eval(groundtruth)] - else: - groundtruth = [groundtruth, None] - if utils.compare_answer_with_groundtruth(answer, *groundtruth): - correct += 1 - is_correct = True - else: - wrong += 1 - is_correct = False - - details.append( - { - # "question": question, - # "solution": output, - # "correct": groundtruth, - "pred": answer, - "is_correct": is_correct, - } - ) - - score = correct / (correct + wrong) * 100 - return {'score': score, 'details': details} +import re +import json + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS +from opencompass.utils import get_data_path + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from ..base import BaseDataset +from . 
import utils +from tqdm import tqdm + + +@LOAD_DATASET.register_module() +class TheoremQADatasetV3(BaseDataset): + + @staticmethod + def load(path: str): + path = get_data_path(path, local_mode=True) + with open(path, 'r') as f: + data = json.load(f) + for item in data: + item['Answer'] = str(item['Answer']) + dataset = Dataset.from_list(data) + return dataset + + +def TheoremQA_postprocess_v3(text: str) -> str: + answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text) + return answer + +def TheoremQA_postprocess_v4(text: str) -> str: + # First clean the answer text + answer = utils.answer_clean(["The answer is:", "The answer is", "the answer is"], text) + # Remove LaTeX delimiters \( and \) and strip whitespace + answer = answer.strip('\\(').strip('\\)').strip() + return answer + + +@ICL_EVALUATORS.register_module() +class TheoremQAEvaluatorV3(BaseEvaluator): + def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {"error": "preds and refrs have different length"} + + details = [] + correct, wrong = 0, 0 + for index in tqdm(range(len(predictions))): + answer = predictions[index] + groundtruth = references[index] + answer_type = test_set[index]['Answer_type'] + if answer_type in ['float', 'integer', 'bool']: + groundtruth = [groundtruth, eval(groundtruth)] + else: + groundtruth = [groundtruth, None] + if utils.compare_answer_with_groundtruth(answer, *groundtruth): + correct += 1 + is_correct = True + else: + wrong += 1 + is_correct = False + + details.append( + { + # "question": question, + # "solution": output, + # "correct": groundtruth, + "pred": answer, + "is_correct": is_correct, + } + ) + + score = correct / (correct + wrong) * 100 + return {'score': score, 'details': details} diff --git a/opencompass/datasets/TheoremQA/number_utils.py b/opencompass/datasets/TheoremQA/number_utils.py index fd93fe66..909ef60b 100644 --- a/opencompass/datasets/TheoremQA/number_utils.py +++ b/opencompass/datasets/TheoremQA/number_utils.py @@ -1,98 +1,98 @@ -import re -import math -from math import sqrt, sin, cos, log, pi, factorial, exp, e -E = 2.718 - - -def floatify(num: str): - try: - num = float(num) - if num.is_integer(): - return round(num) - else: - return num - except Exception: - return None - - -def within_eps(pred: float, gt: float): - eps = abs(gt) * 0.04 - if pred >= gt - eps and pred <= gt + eps: - return True - else: - return False - - -def clean_units(pred_str: str): - """Clean the units in the number.""" - def convert_pi_to_number(code_string): - code_string = code_string.replace('\\pi', 'π') - # Replace \pi or π not preceded by a digit or } with 3.14 - code_string = re.sub(r'(? 
"3*3.14" - code_string = re.sub(r'(\d)(\\?π)', r'\1*3.14', code_string) - # Handle cases where π is within braces or followed by a multiplication symbol - # This replaces "{π}" with "3.14" directly and "3*π" with "3*3.14" - code_string = re.sub(r'\{(\\?π)\}', '3.14', code_string) - code_string = re.sub(r'\*(\\?π)', '*3.14', code_string) - return code_string - - pred_str = convert_pi_to_number(pred_str) - pred_str = pred_str.replace('%', '/100') - pred_str = pred_str.replace('$', '') - pred_str = pred_str.replace('¥', '') - pred_str = pred_str.replace('°C', '') - pred_str = pred_str.replace(' C', '') - pred_str = pred_str.replace('°', '') - return pred_str - - -def number_it(num): - from latex2sympy2_extended import latex2sympy - if isinstance(num, (int, float)): - return num - - num = clean_units(num) - try: - num = str(latex2sympy(num)) - except Exception: - pass - - if floatify(num) is not None: - return floatify(num) - else: - try: - num = eval(num) - if isinstance(num, list) or isinstance(num, tuple): - num = num[0] - if floatify(num) is not None: - return floatify(num) - else: - return None - except Exception: - return None - - -def compare_two_numbers(p, gt): - try: - if math.isnan(p): - return False - if isinstance(gt, int): - return round(p) == gt - else: - return within_eps(pred=p, gt=gt) - except Exception: - return False - - -def compare_two_list(pred, gt): - if not isinstance(pred, list): - return False - elif len(pred) != len(gt): - return False - elif any([not isinstance(x, (int, float)) for x in pred]): - return False - else: - pred = sorted(pred) - gt = sorted(gt) - return all([compare_two_numbers(p, g) for p, g in zip(pred, gt)]) +import re +import math +from math import sqrt, sin, cos, log, pi, factorial, exp, e +E = 2.718 + + +def floatify(num: str): + try: + num = float(num) + if num.is_integer(): + return round(num) + else: + return num + except Exception: + return None + + +def within_eps(pred: float, gt: float): + eps = abs(gt) * 0.04 + if pred >= gt - eps and pred <= gt + eps: + return True + else: + return False + + +def clean_units(pred_str: str): + """Clean the units in the number.""" + def convert_pi_to_number(code_string): + code_string = code_string.replace('\\pi', 'π') + # Replace \pi or π not preceded by a digit or } with 3.14 + code_string = re.sub(r'(? 
"3*3.14" + code_string = re.sub(r'(\d)(\\?π)', r'\1*3.14', code_string) + # Handle cases where π is within braces or followed by a multiplication symbol + # This replaces "{π}" with "3.14" directly and "3*π" with "3*3.14" + code_string = re.sub(r'\{(\\?π)\}', '3.14', code_string) + code_string = re.sub(r'\*(\\?π)', '*3.14', code_string) + return code_string + + pred_str = convert_pi_to_number(pred_str) + pred_str = pred_str.replace('%', '/100') + pred_str = pred_str.replace('$', '') + pred_str = pred_str.replace('¥', '') + pred_str = pred_str.replace('°C', '') + pred_str = pred_str.replace(' C', '') + pred_str = pred_str.replace('°', '') + return pred_str + + +def number_it(num): + from latex2sympy2_extended import latex2sympy + if isinstance(num, (int, float)): + return num + + num = clean_units(num) + try: + num = str(latex2sympy(num)) + except Exception: + pass + + if floatify(num) is not None: + return floatify(num) + else: + try: + num = eval(num) + if isinstance(num, list) or isinstance(num, tuple): + num = num[0] + if floatify(num) is not None: + return floatify(num) + else: + return None + except Exception: + return None + + +def compare_two_numbers(p, gt): + try: + if math.isnan(p): + return False + if isinstance(gt, int): + return round(p) == gt + else: + return within_eps(pred=p, gt=gt) + except Exception: + return False + + +def compare_two_list(pred, gt): + if not isinstance(pred, list): + return False + elif len(pred) != len(gt): + return False + elif any([not isinstance(x, (int, float)) for x in pred]): + return False + else: + pred = sorted(pred) + gt = sorted(gt) + return all([compare_two_numbers(p, g) for p, g in zip(pred, gt)]) diff --git a/opencompass/datasets/TheoremQA/utils.py b/opencompass/datasets/TheoremQA/utils.py index ca9c2661..c89e5da0 100644 --- a/opencompass/datasets/TheoremQA/utils.py +++ b/opencompass/datasets/TheoremQA/utils.py @@ -1,110 +1,110 @@ -import re -from .number_utils import clean_units, compare_two_numbers, compare_two_list, number_it -import contextlib -import signal - -@contextlib.contextmanager -def time_limit(seconds: float): - def signal_handler(signum, frame): - raise ValueError - - signal.setitimer(signal.ITIMER_REAL, seconds) - signal.signal(signal.SIGALRM, signal_handler) - try: - yield - finally: - signal.setitimer(signal.ITIMER_REAL, 0) - - -def extract_theoremqa_answer(pred: str, answer_flag: bool = True): - from latex2sympy2_extended import latex2sympy - - if any([option in pred.lower() for option in ['yes', 'true']]): - pred = 'True' - elif any([option in pred.lower() for option in ['no', 'false']]): - pred = 'False' - elif any([option in pred.lower() for option in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']]): - pass - else: - if answer_flag: - # Extract the numbers out of the string - pred = pred.split('=')[-1].strip() - pred = clean_units(pred) - try: - with time_limit(1): - tmp = str(latex2sympy(pred)) - pred = str(eval(tmp)) - except Exception: - if re.match(r'-?[\d\.]+\s\D+$', pred): - pred = pred.split(' ')[0] - elif re.match(r'-?[\d\.]+\s[^\s]+$', pred): - pred = pred.split(' ')[0] - else: - # desparate search over the last number - preds = re.findall(r'-?\d*\.?\d+', pred) - if(len(preds) >= 1): - pred = preds[-1] - else: - pred = '' - return pred - -def answer_clean(direct_answer_trigger_for_fewshot: tuple, pred: str): - pred = pred.strip('\n') - - # Determine if this is ICL, if so, use \n\n to split the first chunk. 
- ICL = False - for trigger in direct_answer_trigger_for_fewshot: - if pred.count(trigger) > 1: - ICL = True - if ICL: - pred = pred.split('\n\n')[0] - - # Split the trigger to find the answer. - preds = re.split('|'.join(direct_answer_trigger_for_fewshot), pred) - if len(preds) > 1: - answer_flag = True - pred = preds[-1] - else: - answer_flag = False - - pred = pred.strip('\n').rstrip('.').rstrip('/').strip(' ') - - pred = [extract_theoremqa_answer(pred, answer_flag)] - - # If there is no candidate in list, null is set. - if len(pred) == 0: - pred = "" - else: - if answer_flag: - # choose the first element in list ... - pred = pred[0] - else: - # choose the last e - pred = pred[-1] - - # Remove the period at the end, again! - pred = pred.rstrip('.').rstrip('/') - return pred - - - -def compare_answer_with_groundtruth(answer: str, groundtruth_str: str, groundtruth_num = None): - if groundtruth_str.lower() in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']: - return groundtruth_str.lower() in answer.lower() - elif answer.lower() == groundtruth_str.lower(): - return True - elif groundtruth_num is not None: - if isinstance(groundtruth_num, (int, float)): - return compare_two_numbers(number_it(answer), groundtruth_num) - else: - if answer.startswith('(') and answer.endswith(')'): - try: - answer = list(eval(answer)) - answer = [number_it(a) for a in answer] - except Exception as e: - return False - return compare_two_list(answer, groundtruth_num) - else: - return False - else: - return False +import re +from .number_utils import clean_units, compare_two_numbers, compare_two_list, number_it +import contextlib +import signal + +@contextlib.contextmanager +def time_limit(seconds: float): + def signal_handler(signum, frame): + raise ValueError + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + + +def extract_theoremqa_answer(pred: str, answer_flag: bool = True): + from latex2sympy2_extended import latex2sympy + + if any([option in pred.lower() for option in ['yes', 'true']]): + pred = 'True' + elif any([option in pred.lower() for option in ['no', 'false']]): + pred = 'False' + elif any([option in pred.lower() for option in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']]): + pass + else: + if answer_flag: + # Extract the numbers out of the string + pred = pred.split('=')[-1].strip() + pred = clean_units(pred) + try: + with time_limit(1): + tmp = str(latex2sympy(pred)) + pred = str(eval(tmp)) + except Exception: + if re.match(r'-?[\d\.]+\s\D+$', pred): + pred = pred.split(' ')[0] + elif re.match(r'-?[\d\.]+\s[^\s]+$', pred): + pred = pred.split(' ')[0] + else: + # desparate search over the last number + preds = re.findall(r'-?\d*\.?\d+', pred) + if(len(preds) >= 1): + pred = preds[-1] + else: + pred = '' + return pred + +def answer_clean(direct_answer_trigger_for_fewshot: tuple, pred: str): + pred = pred.strip('\n') + + # Determine if this is ICL, if so, use \n\n to split the first chunk. + ICL = False + for trigger in direct_answer_trigger_for_fewshot: + if pred.count(trigger) > 1: + ICL = True + if ICL: + pred = pred.split('\n\n')[0] + + # Split the trigger to find the answer. 
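
Taken together, answer_clean, extract_theoremqa_answer and the number_utils helpers above form the TheoremQA answer pipeline: split on an answer trigger, strip units, then compare against the ground truth with a 4% relative tolerance for float references and exact-after-rounding matching for integer references. A small end-to-end sketch, assuming an environment where opencompass and its latex2sympy2_extended dependency are importable; the SIGALRM-based time_limit also makes this Unix-only:

# Module paths are taken from the diff headers above; the sample strings are invented.
from opencompass.datasets.TheoremQA import utils
from opencompass.datasets.TheoremQA.number_utils import (clean_units,
                                                         compare_two_numbers)

triggers = ['The answer is:', 'The answer is', 'the answer is']
pred_text = 'We substitute x = 3 and simplify. The answer is 42.'

answer = utils.answer_clean(triggers, pred_text)
print(answer)                                                    # '42'
print(utils.compare_answer_with_groundtruth(answer, '42', 42))   # True

# The numeric comparison is tolerant for float ground truths (within_eps
# allows a 4% relative band) and exact-after-rounding for integer ones.
print(clean_units('50%'))                # '50/100'
print(compare_two_numbers(3.20, 3.14))   # True  (within 4% of 3.14)
print(compare_two_numbers(101.0, 100))   # False (integer ground truth)
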
+ preds = re.split('|'.join(direct_answer_trigger_for_fewshot), pred) + if len(preds) > 1: + answer_flag = True + pred = preds[-1] + else: + answer_flag = False + + pred = pred.strip('\n').rstrip('.').rstrip('/').strip(' ') + + pred = [extract_theoremqa_answer(pred, answer_flag)] + + # If there is no candidate in list, null is set. + if len(pred) == 0: + pred = "" + else: + if answer_flag: + # choose the first element in list ... + pred = pred[0] + else: + # choose the last e + pred = pred[-1] + + # Remove the period at the end, again! + pred = pred.rstrip('.').rstrip('/') + return pred + + + +def compare_answer_with_groundtruth(answer: str, groundtruth_str: str, groundtruth_num = None): + if groundtruth_str.lower() in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']: + return groundtruth_str.lower() in answer.lower() + elif answer.lower() == groundtruth_str.lower(): + return True + elif groundtruth_num is not None: + if isinstance(groundtruth_num, (int, float)): + return compare_two_numbers(number_it(answer), groundtruth_num) + else: + if answer.startswith('(') and answer.endswith(')'): + try: + answer = list(eval(answer)) + answer = [number_it(a) for a in answer] + except Exception as e: + return False + return compare_two_list(answer, groundtruth_num) + else: + return False + else: + return False diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 3e2d0eef..9cc5043c 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -85,6 +85,7 @@ from .llm_compression import LLMCompressionDataset # noqa: F401, F403 from .longbench import * # noqa: F401, F403 from .longbenchv2 import * # noqa: F401, F403 from .lveval import * # noqa: F401, F403 +from .maritime_bench import * # noqa: F401, F403 from .mastermath2024v1 import * # noqa: F401, F403 from .math import * # noqa: F401, F403 from .math401 import * # noqa: F401, F403 diff --git a/opencompass/datasets/lawbench/evaluation_functions/__init__.py b/opencompass/datasets/lawbench/evaluation_functions/__init__.py index 8b137891..d3f5a12f 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/__init__.py +++ b/opencompass/datasets/lawbench/evaluation_functions/__init__.py @@ -1 +1 @@ - + diff --git a/opencompass/datasets/lawbench/evaluation_functions/cjft.py b/opencompass/datasets/lawbench/evaluation_functions/cjft.py index 71d6c1dd..bf149db8 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/cjft.py +++ b/opencompass/datasets/lawbench/evaluation_functions/cjft.py @@ -1,19 +1,19 @@ -from ..utils.function_utils import compute_rouge - -#情景法条识别 - -def compute_cjft(data_dict): - """ - Compute the ROUGE-L score between the prediction and the reference - """ - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - predictions.append(prediction) - references.append(answer) - - # compute the accuracy of score_list - rouge_scores = compute_rouge(predictions, references) - rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] - average_rouge_l = sum(rouge_ls) / len(rouge_ls) - return {"score": average_rouge_l} +from ..utils.function_utils import compute_rouge + +#情景法条识别 + +def compute_cjft(data_dict): + """ + Compute the ROUGE-L score between the prediction and the reference + """ + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + 
predictions.append(prediction) + references.append(answer) + + # compute the accuracy of score_list + rouge_scores = compute_rouge(predictions, references) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": average_rouge_l} diff --git a/opencompass/datasets/lawbench/evaluation_functions/flzx.py b/opencompass/datasets/lawbench/evaluation_functions/flzx.py index 376c7733..9d0f6ec7 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/flzx.py +++ b/opencompass/datasets/lawbench/evaluation_functions/flzx.py @@ -1,18 +1,18 @@ -from ..utils.function_utils import compute_rouge - -#法律咨询 -def compute_flzx(data_dict): - """ - Compute the ROUGE-L score between the prediction and the reference - """ - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - predictions.append(prediction) - references.append(answer) - - # compute the accuracy of score_list - rouge_scores = compute_rouge(predictions, references) - rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] - average_rouge_l = sum(rouge_ls) / len(rouge_ls) - return {"score": average_rouge_l} +from ..utils.function_utils import compute_rouge + +#法律咨询 +def compute_flzx(data_dict): + """ + Compute the ROUGE-L score between the prediction and the reference + """ + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + predictions.append(prediction) + references.append(answer) + + # compute the accuracy of score_list + rouge_scores = compute_rouge(predictions, references) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": average_rouge_l} diff --git a/opencompass/datasets/lawbench/evaluation_functions/ftcs.py b/opencompass/datasets/lawbench/evaluation_functions/ftcs.py index 009099e7..5b21b632 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ftcs.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ftcs.py @@ -1,19 +1,19 @@ -from ..utils.function_utils import compute_rouge - -#法条记忆问答 -def compute_ftcs(data_dict): - """ - Compute the ROUGE-L score between the prediction and the reference - """ - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - answer = answer.replace("答案:", "") - predictions.append(prediction) - references.append(answer) - - # compute the accuracy of score_list - rouge_scores = compute_rouge(predictions, references) - rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] - average_rouge_l = sum(rouge_ls) / len(rouge_ls) - return {"score": average_rouge_l} +from ..utils.function_utils import compute_rouge + +#法条记忆问答 +def compute_ftcs(data_dict): + """ + Compute the ROUGE-L score between the prediction and the reference + """ + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + answer = answer.replace("答案:", "") + predictions.append(prediction) + references.append(answer) + + # compute the accuracy of score_list + rouge_scores = compute_rouge(predictions, references) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": 
average_rouge_l} diff --git a/opencompass/datasets/lawbench/evaluation_functions/jdzy.py b/opencompass/datasets/lawbench/evaluation_functions/jdzy.py index 1129d58b..498df762 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/jdzy.py +++ b/opencompass/datasets/lawbench/evaluation_functions/jdzy.py @@ -1,36 +1,36 @@ -from ..utils.function_utils import multi_choice_judge - -""" -multi-choice single-label selection -metric: accuracy -争议焦点:识别案件涉及的争议焦点 -""" - -def compute_jdzy(data_dict): - """ - Compute the Accuracy - The JEC dataset has 16 possible answers for each question, stored in the option_list - A prediction is correct if - 1. The correct answer appears in the prediction, and - 2. Options other than the answer do not appear in the prediction. - """ - - score_list, abstentions = [], 0 - option_list = ["诉讼主体", "租金情况", "利息", "本金争议", "责任认定", "责任划分", "损失认定及处理", - "原审判决是否适当", "合同效力", "财产分割", "责任承担", "鉴定结论采信问题", "诉讼时效", "违约", "合同解除", "肇事逃逸"] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - if answer[7:-1] == "赔偿": - # todo: dataset imperfection - continue - assert answer.startswith("争议焦点类别:") and answer[7:-1] in option_list, \ - f"answer: {answer} \n question: {question}" - - answer_letter = answer[7:-1] - judge = multi_choice_judge(prediction, option_list, answer_letter) - score_list.append(judge["score"]) - abstentions += judge["abstention"] - - # compute the accuracy of score_list - accuracy = sum(score_list) / len(score_list) - return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} +from ..utils.function_utils import multi_choice_judge + +""" +multi-choice single-label selection +metric: accuracy +争议焦点:识别案件涉及的争议焦点 +""" + +def compute_jdzy(data_dict): + """ + Compute the Accuracy + The JEC dataset has 16 possible answers for each question, stored in the option_list + A prediction is correct if + 1. The correct answer appears in the prediction, and + 2. Options other than the answer do not appear in the prediction. + """ + + score_list, abstentions = [], 0 + option_list = ["诉讼主体", "租金情况", "利息", "本金争议", "责任认定", "责任划分", "损失认定及处理", + "原审判决是否适当", "合同效力", "财产分割", "责任承担", "鉴定结论采信问题", "诉讼时效", "违约", "合同解除", "肇事逃逸"] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + if answer[7:-1] == "赔偿": + # todo: dataset imperfection + continue + assert answer.startswith("争议焦点类别:") and answer[7:-1] in option_list, \ + f"answer: {answer} \n question: {question}" + + answer_letter = answer[7:-1] + judge = multi_choice_judge(prediction, option_list, answer_letter) + score_list.append(judge["score"]) + abstentions += judge["abstention"] + + # compute the accuracy of score_list + accuracy = sum(score_list) / len(score_list) + return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py b/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py index f6c98ad7..45a7f0f6 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py +++ b/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py @@ -1,29 +1,29 @@ -from ..utils.function_utils import multi_choice_judge - -""" -Task: multi-choice selection -Metric: Accuracy -司法考试-案例分析 -""" -def compute_jec_ac(data_dict): - """ - Compute the Accuracy - The JEC dataset has 4 options for each question: A, B, C, D - A prediction is correct if - 1. 
The correct answer appears in the prediction, and - 2. Options other than the answer do not appear in the prediction. - """ - score_list, abstentions = [], 0 - option_list = ["A", "B", "C", "D"] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}" - - answer_letter = answer[5] - judge = multi_choice_judge(prediction, option_list, answer_letter) - score_list.append(judge["score"]) - abstentions += judge["abstention"] - - # compute the accuracy of score_list - accuracy = sum(score_list) / len(score_list) - return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} +from ..utils.function_utils import multi_choice_judge + +""" +Task: multi-choice selection +Metric: Accuracy +司法考试-案例分析 +""" +def compute_jec_ac(data_dict): + """ + Compute the Accuracy + The JEC dataset has 4 options for each question: A, B, C, D + A prediction is correct if + 1. The correct answer appears in the prediction, and + 2. Options other than the answer do not appear in the prediction. + """ + score_list, abstentions = [], 0 + option_list = ["A", "B", "C", "D"] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}" + + answer_letter = answer[5] + judge = multi_choice_judge(prediction, option_list, answer_letter) + score_list.append(judge["score"]) + abstentions += judge["abstention"] + + # compute the accuracy of score_list + accuracy = sum(score_list) / len(score_list) + return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py b/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py index 3afe4ef9..f68dfad1 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py +++ b/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py @@ -1,29 +1,29 @@ -from ..utils.function_utils import multi_choice_judge - -""" -Task: multi-choice selection -Metric: Accuracy -司法考试 -""" -def compute_jec_kd(data_dict): - """ - Compute the Accuracy - The JEC_KD dataset has 4 options for each question: A, B, C, D - A prediction is correct if - 1. The correct answer appears in the prediction, and - 2. Options other than the answer do not appear in the prediction. - """ - score_list, abstentions = [], 0 - option_list = ["A", "B", "C", "D"] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}" - - answer_letter = answer[5] - judge = multi_choice_judge(prediction, option_list, answer_letter) - score_list.append(judge["score"]) - abstentions += judge["abstention"] - - # compute the accuracy of score_list - accuracy = sum(score_list) / len(score_list) - return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} +from ..utils.function_utils import multi_choice_judge + +""" +Task: multi-choice selection +Metric: Accuracy +司法考试 +""" +def compute_jec_kd(data_dict): + """ + Compute the Accuracy + The JEC_KD dataset has 4 options for each question: A, B, C, D + A prediction is correct if + 1. The correct answer appears in the prediction, and + 2. 
Options other than the answer do not appear in the prediction. + """ + score_list, abstentions = [], 0 + option_list = ["A", "B", "C", "D"] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}" + + answer_letter = answer[5] + judge = multi_choice_judge(prediction, option_list, answer_letter) + score_list.append(judge["score"]) + abstentions += judge["abstention"] + + # compute the accuracy of score_list + accuracy = sum(score_list) / len(score_list) + return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/jetq.py b/opencompass/datasets/lawbench/evaluation_functions/jetq.py index 936de7c5..48b4afab 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/jetq.py +++ b/opencompass/datasets/lawbench/evaluation_functions/jetq.py @@ -1,43 +1,43 @@ -import re - -""" -number prediction -metric: accuracy -金额提取 -""" -def compute_jetq(data_dict): - """ - Compute the Accuracy - we extract the total amount of cost involved in the crime from the prediction and compare it with the reference - The prediction is correct if - the total amount of cost provided in the reference, appears in the prediction. - """ - score_list, abstentions = [], 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("上文涉及到的犯罪金额:"), f"answer: {answer}, question: {question}" - assert answer.endswith("元。"), f"answer: {answer}, question: {question}" - answer = answer.replace("上文涉及到的犯罪金额:", "") - - assert "千元" not in answer, f"answer: {answer}, question: {question}" - assert "万" not in answer, f"answer: {answer}, question: {question}" - - # remove "元" - answer = answer.replace("元。", "") - answer = float(answer) - - prediction_digits = re.findall(r"\d+\.?\d*", prediction) - prediction_digits = [float(digit) for digit in prediction_digits] - - if len(prediction_digits) == 0: - abstentions += 1 - if answer in prediction_digits: - score_list.append(1) - else: - score_list.append(0) - - - # compute the accuracy of score_list - accuracy = sum(score_list) / len(score_list) - return {"score": accuracy, "abstention_rate": abstentions/len(data_dict)} +import re + +""" +number prediction +metric: accuracy +金额提取 +""" +def compute_jetq(data_dict): + """ + Compute the Accuracy + we extract the total amount of cost involved in the crime from the prediction and compare it with the reference + The prediction is correct if + the total amount of cost provided in the reference, appears in the prediction. 
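
compute_jec_ac and compute_jec_kd above delegate the per-example decision to multi_choice_judge from ..utils.function_utils, which this diff does not include; note that the answer[5] indexing works because "正确答案:" is exactly five characters, so the sixth character is the gold option letter (lblj further down uses answer[6] for the six-character prefix "[正确答案]"). A rough stand-in that follows the two docstring conditions; the abstention rule is an assumption, not read from the real helper:

def multi_choice_judge_sketch(prediction: str, option_list, answer_letter: str) -> dict:
    # Correct only if the gold option appears and no other option does,
    # mirroring the two conditions stated in the docstrings above.
    hits = [opt for opt in option_list if opt in prediction]
    score = 1 if hits == [answer_letter] else 0
    # Assumed: an abstention is counted when no option is mentioned at all.
    abstention = 1 if len(hits) == 0 else 0
    return {'score': score, 'abstention': abstention}


print(multi_choice_judge_sketch('正确答案:B', ['A', 'B', 'C', 'D'], 'B'))
# {'score': 1, 'abstention': 0}
print(multi_choice_judge_sketch('可能是B,也可能是C', ['A', 'B', 'C', 'D'], 'B'))
# {'score': 0, 'abstention': 0}
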
+ """ + score_list, abstentions = [], 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("上文涉及到的犯罪金额:"), f"answer: {answer}, question: {question}" + assert answer.endswith("元。"), f"answer: {answer}, question: {question}" + answer = answer.replace("上文涉及到的犯罪金额:", "") + + assert "千元" not in answer, f"answer: {answer}, question: {question}" + assert "万" not in answer, f"answer: {answer}, question: {question}" + + # remove "元" + answer = answer.replace("元。", "") + answer = float(answer) + + prediction_digits = re.findall(r"\d+\.?\d*", prediction) + prediction_digits = [float(digit) for digit in prediction_digits] + + if len(prediction_digits) == 0: + abstentions += 1 + if answer in prediction_digits: + score_list.append(1) + else: + score_list.append(0) + + + # compute the accuracy of score_list + accuracy = sum(score_list) / len(score_list) + return {"score": accuracy, "abstention_rate": abstentions/len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/lblj.py b/opencompass/datasets/lawbench/evaluation_functions/lblj.py index 7675ec99..0bc20e24 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/lblj.py +++ b/opencompass/datasets/lawbench/evaluation_functions/lblj.py @@ -1,29 +1,29 @@ -from ..utils.function_utils import multi_choice_judge - -""" -Task: multi-choice selection -Metric: Accuracy -论辩挖掘 -""" -def compute_lblj(data_dict): - """ - Compute the Accuracy - The LBLJ dataset has 5 options for each question: A, B, C, D, E - A prediction is correct if - 1. The correct answer appears in the prediction, and - 2. Options other than the answer do not appear in the prediction. - """ - score_list, abstentions = [], 0 - option_list = ["A", "B", "C", "D", "E"] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("[正确答案]") and answer[6] in option_list, f"answer[6]: {answer}, question: {question}" - - answer_letter = answer[6] - judge = multi_choice_judge(prediction, option_list, answer_letter) - score_list.append(judge["score"]) - abstentions += judge["abstention"] - - # compute the accuracy of score_list - accuracy = sum(score_list) / len(score_list) - return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} +from ..utils.function_utils import multi_choice_judge + +""" +Task: multi-choice selection +Metric: Accuracy +论辩挖掘 +""" +def compute_lblj(data_dict): + """ + Compute the Accuracy + The LBLJ dataset has 5 options for each question: A, B, C, D, E + A prediction is correct if + 1. The correct answer appears in the prediction, and + 2. Options other than the answer do not appear in the prediction. 
+ """ + score_list, abstentions = [], 0 + option_list = ["A", "B", "C", "D", "E"] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("[正确答案]") and answer[6] in option_list, f"answer[6]: {answer}, question: {question}" + + answer_letter = answer[6] + judge = multi_choice_judge(prediction, option_list, answer_letter) + score_list.append(judge["score"]) + abstentions += judge["abstention"] + + # compute the accuracy of score_list + accuracy = sum(score_list) / len(score_list) + return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py b/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py index dc16a7c4..93690a9f 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py @@ -1,76 +1,76 @@ -from ..utils.function_utils import compute_f1_two_sets -""" -task: legal accusation prediction -metric: f1 score -法律判决预测-罪名预测 -""" - -option_list = ["侮辱", "违法发放贷款", "失火", "票据诈骗", "帮助犯罪分子逃避处罚", "重大责任事故", "对非国家工作人员行贿", - "非法制造、销售非法制造的注册商标标识", "非法制造、买卖、运输、邮寄、储存枪支、弹药、爆炸物", "非法获取公民个人信息", - "扰乱无线电通讯管理秩序", "非法持有、私藏枪支、弹药", "拒不执行判决、裁定", "虚开发票", "巨额财产来源不明", - "组织、领导、参加黑社会性质组织", "非法获取国家秘密", "以危险方法危害公共安全", "非法持有毒品", - "聚众扰乱公共场所秩序、交通秩序", "包庇毒品犯罪分子", "滥伐林木", "伪造公司、企业、事业单位、人民团体印章", - "非法占用农用地", "走私废物", "串通投标", "非法采伐、毁坏国家重点保护植物", "冒充军人招摇撞骗", "玩忽职守", - "重婚", "招收公务员、学生徇私舞弊", "组织、领导传销活动", "非法猎捕、杀害珍贵、濒危野生动物", "侵犯著作权", - "非法种植毒品原植物", "伪造、变造、买卖武装部队公文、证件、印章", "倒卖文物", "伪造、变造居民身份证", "滥用职权", - "诽谤", "猥亵儿童", "非法转让、倒卖土地使用权", "挪用公款", "污染环境", "出售、购买、运输假币", "敲诈勒索", - "高利转贷", "故意伤害", "持有、使用假币", "单位受贿", "强奸", "引诱、容留、介绍卖淫", "虐待", - "生产、销售伪劣农药、兽药、化肥、种子", "妨害公务", "容留他人吸毒", "拐骗儿童", "强制猥亵、侮辱妇女", - "非法处置查封、扣押、冻结的财产", "骗取贷款、票据承兑、金融票证", "强迫他人吸毒", "非法拘禁", - "非法携带枪支、弹药、管制刀具、危险物品危及公共安全", "绑架", "聚众斗殴", "破坏计算机信息系统", - "制造、贩卖、传播淫秽物品", "虐待被监管人", "贷款诈骗", "赌博", "徇私舞弊不征、少征税款", - "盗窃、抢夺枪支、弹药、爆炸物、危险物质", "故意杀人", "介绍贿赂", "提供侵入、非法控制计算机信息系统程序、工具", - "编造、故意传播虚假恐怖信息", "妨害作证", "强迫卖淫", "走私、贩卖、运输、制造毒品", "伪证", "拐卖妇女、儿童", - "过失损坏武器装备、军事设施、军事通信", "破坏广播电视设施、公用电信设施", "洗钱", "职务侵占", "倒卖车票、船票", - "抢劫", "侵占", "掩饰、隐瞒犯罪所得、犯罪所得收益", "徇私舞弊不移交刑事案件", "引诱、教唆、欺骗他人吸毒", "遗弃", - "生产、销售伪劣产品", "放火", "非法采矿", "对单位行贿", "盗窃、抢夺枪支、弹药、爆炸物", "破坏易燃易爆设备", - "妨害信用卡管理", "制作、复制、出版、贩卖、传播淫秽物品牟利", "金融凭证诈骗", "私分国有资产", - "走私国家禁止进出口的货物、物品", "假冒注册商标", "危险物品肇事", "走私普通货物、物品", "经济犯", "虚报注册资本", - "盗掘古文化遗址、古墓葬", "传播淫秽物品", "窝藏、包庇", "拒不支付劳动报酬", "行贿", "开设赌场", "传授犯罪方法", - "协助组织卖淫", "保险诈骗", "破坏生产经营", "破坏交通设施", "打击报复证人", "非法侵入住宅", "非国家工作人员受贿", - "过失致人重伤", "伪造、变造金融票证", "窝藏、转移、隐瞒毒品、毒赃", "帮助毁灭、伪造证据", "走私珍贵动物、珍贵动物制品", - "生产、销售假药", "逃税", "挪用特定款物", "聚众扰乱社会秩序", "组织、强迫、引诱、容留、介绍卖淫", "合同诈骗", - "非法生产、销售间谍专用器材", "破坏交通工具", "传播性病", "强迫交易", "隐匿、故意销毁会计凭证、会计帐簿、财务会计报告", - "非法组织卖血", "强迫劳动", "破坏电力设备", "销售假冒注册商标的商品", "收买被拐卖的妇女、儿童", "诬告陷害", "脱逃", - "非法经营", "徇私枉法", "信用卡诈骗", "生产、销售不符合安全标准的食品", "非法行医", "伪造货币", "动植物检疫徇私舞弊", - "单位行贿", "破坏监管秩序", "盗窃", "盗伐林木", "重大劳动安全事故", "非法吸收公众存款", - "非法制造、出售非法制造的发票", "非法狩猎", "组织卖淫", "非法买卖、运输、携带、持有毒品原植物种子、幼苗", "挪用资金", - "诈骗", "伪造、变造、买卖国家机关公文、证件、印章", "持有伪造的发票", "贪污", "非法生产、买卖警用装备", - "投放危险物质", "伪造、倒卖伪造的有价票证", "集资诈骗", "抢夺", "生产、销售有毒、有害食品", "非法捕捞水产品", - "过失致人死亡", "非法买卖制毒物品", "虚开增值税专用发票、用于骗取出口退税、抵扣税款发票", "寻衅滋事", "危险驾驶", - "故意毁坏财物", "招摇撞骗", "盗窃、侮辱尸体", "走私武器、弹药", - "非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品", "非法出售发票", "劫持船只、汽车", - "受贿", "聚众哄抢", "交通肇事"] - - -def compute_ljp_accusation(data_dict): - """ - Compute the 
F1-score - The LJP_Accusation dataset a set of 189 different accusation types. - A question may involve one or more accusation types. - Given a list of accusation types from both the ground truth and the prediction, we compute the F1-score between - these two lists. - """ - score_list, abstentions = [], 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - - assert answer.startswith("罪名:"), f"answer: {answer} \n question: {question}" - answer = answer.replace("罪名:", "") - answers = answer.split(";") - - prediction_list =[] - for option in option_list: - if option in prediction: - prediction_list.append(option) - - if len(prediction_list) == 0: - abstentions += 1 - gt_set = set(answers) - pred_set = set(prediction_list) - score = compute_f1_two_sets(gt_set, pred_set) - score_list.append(score) - - f1_score_average = sum(score_list) / len(score_list) - return {"score": f1_score_average, "abstention_rate": abstentions/len(data_dict)} +from ..utils.function_utils import compute_f1_two_sets +""" +task: legal accusation prediction +metric: f1 score +法律判决预测-罪名预测 +""" + +option_list = ["侮辱", "违法发放贷款", "失火", "票据诈骗", "帮助犯罪分子逃避处罚", "重大责任事故", "对非国家工作人员行贿", + "非法制造、销售非法制造的注册商标标识", "非法制造、买卖、运输、邮寄、储存枪支、弹药、爆炸物", "非法获取公民个人信息", + "扰乱无线电通讯管理秩序", "非法持有、私藏枪支、弹药", "拒不执行判决、裁定", "虚开发票", "巨额财产来源不明", + "组织、领导、参加黑社会性质组织", "非法获取国家秘密", "以危险方法危害公共安全", "非法持有毒品", + "聚众扰乱公共场所秩序、交通秩序", "包庇毒品犯罪分子", "滥伐林木", "伪造公司、企业、事业单位、人民团体印章", + "非法占用农用地", "走私废物", "串通投标", "非法采伐、毁坏国家重点保护植物", "冒充军人招摇撞骗", "玩忽职守", + "重婚", "招收公务员、学生徇私舞弊", "组织、领导传销活动", "非法猎捕、杀害珍贵、濒危野生动物", "侵犯著作权", + "非法种植毒品原植物", "伪造、变造、买卖武装部队公文、证件、印章", "倒卖文物", "伪造、变造居民身份证", "滥用职权", + "诽谤", "猥亵儿童", "非法转让、倒卖土地使用权", "挪用公款", "污染环境", "出售、购买、运输假币", "敲诈勒索", + "高利转贷", "故意伤害", "持有、使用假币", "单位受贿", "强奸", "引诱、容留、介绍卖淫", "虐待", + "生产、销售伪劣农药、兽药、化肥、种子", "妨害公务", "容留他人吸毒", "拐骗儿童", "强制猥亵、侮辱妇女", + "非法处置查封、扣押、冻结的财产", "骗取贷款、票据承兑、金融票证", "强迫他人吸毒", "非法拘禁", + "非法携带枪支、弹药、管制刀具、危险物品危及公共安全", "绑架", "聚众斗殴", "破坏计算机信息系统", + "制造、贩卖、传播淫秽物品", "虐待被监管人", "贷款诈骗", "赌博", "徇私舞弊不征、少征税款", + "盗窃、抢夺枪支、弹药、爆炸物、危险物质", "故意杀人", "介绍贿赂", "提供侵入、非法控制计算机信息系统程序、工具", + "编造、故意传播虚假恐怖信息", "妨害作证", "强迫卖淫", "走私、贩卖、运输、制造毒品", "伪证", "拐卖妇女、儿童", + "过失损坏武器装备、军事设施、军事通信", "破坏广播电视设施、公用电信设施", "洗钱", "职务侵占", "倒卖车票、船票", + "抢劫", "侵占", "掩饰、隐瞒犯罪所得、犯罪所得收益", "徇私舞弊不移交刑事案件", "引诱、教唆、欺骗他人吸毒", "遗弃", + "生产、销售伪劣产品", "放火", "非法采矿", "对单位行贿", "盗窃、抢夺枪支、弹药、爆炸物", "破坏易燃易爆设备", + "妨害信用卡管理", "制作、复制、出版、贩卖、传播淫秽物品牟利", "金融凭证诈骗", "私分国有资产", + "走私国家禁止进出口的货物、物品", "假冒注册商标", "危险物品肇事", "走私普通货物、物品", "经济犯", "虚报注册资本", + "盗掘古文化遗址、古墓葬", "传播淫秽物品", "窝藏、包庇", "拒不支付劳动报酬", "行贿", "开设赌场", "传授犯罪方法", + "协助组织卖淫", "保险诈骗", "破坏生产经营", "破坏交通设施", "打击报复证人", "非法侵入住宅", "非国家工作人员受贿", + "过失致人重伤", "伪造、变造金融票证", "窝藏、转移、隐瞒毒品、毒赃", "帮助毁灭、伪造证据", "走私珍贵动物、珍贵动物制品", + "生产、销售假药", "逃税", "挪用特定款物", "聚众扰乱社会秩序", "组织、强迫、引诱、容留、介绍卖淫", "合同诈骗", + "非法生产、销售间谍专用器材", "破坏交通工具", "传播性病", "强迫交易", "隐匿、故意销毁会计凭证、会计帐簿、财务会计报告", + "非法组织卖血", "强迫劳动", "破坏电力设备", "销售假冒注册商标的商品", "收买被拐卖的妇女、儿童", "诬告陷害", "脱逃", + "非法经营", "徇私枉法", "信用卡诈骗", "生产、销售不符合安全标准的食品", "非法行医", "伪造货币", "动植物检疫徇私舞弊", + "单位行贿", "破坏监管秩序", "盗窃", "盗伐林木", "重大劳动安全事故", "非法吸收公众存款", + "非法制造、出售非法制造的发票", "非法狩猎", "组织卖淫", "非法买卖、运输、携带、持有毒品原植物种子、幼苗", "挪用资金", + "诈骗", "伪造、变造、买卖国家机关公文、证件、印章", "持有伪造的发票", "贪污", "非法生产、买卖警用装备", + "投放危险物质", "伪造、倒卖伪造的有价票证", "集资诈骗", "抢夺", "生产、销售有毒、有害食品", "非法捕捞水产品", + "过失致人死亡", "非法买卖制毒物品", "虚开增值税专用发票、用于骗取出口退税、抵扣税款发票", "寻衅滋事", "危险驾驶", + "故意毁坏财物", "招摇撞骗", "盗窃、侮辱尸体", "走私武器、弹药", + "非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品", "非法出售发票", "劫持船只、汽车", + "受贿", "聚众哄抢", "交通肇事"] + + +def compute_ljp_accusation(data_dict): + """ + 
Compute the F1-score + The LJP_Accusation dataset a set of 189 different accusation types. + A question may involve one or more accusation types. + Given a list of accusation types from both the ground truth and the prediction, we compute the F1-score between + these two lists. + """ + score_list, abstentions = [], 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + + assert answer.startswith("罪名:"), f"answer: {answer} \n question: {question}" + answer = answer.replace("罪名:", "") + answers = answer.split(";") + + prediction_list =[] + for option in option_list: + if option in prediction: + prediction_list.append(option) + + if len(prediction_list) == 0: + abstentions += 1 + gt_set = set(answers) + pred_set = set(prediction_list) + score = compute_f1_two_sets(gt_set, pred_set) + score_list.append(score) + + f1_score_average = sum(score_list) / len(score_list) + return {"score": f1_score_average, "abstention_rate": abstentions/len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py b/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py index 637eae5b..9b895bab 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py @@ -1,70 +1,70 @@ -import re - -""" -task: law article prediction -metric: F1 score -法律判决预测-法条预测 -""" -def replace_match(match): - return match.group(1) - -def compute_ljp_article(data_dict): - """ - Compute the F1-score - A reference contains a list of articles of the Criminal Law of the People's Republic of China. - We compute the F1-score between the prediction and the reference. - """ - import cn2an - - score_list, abstentions = [], 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("法条:刑法第"), f"answer: {answer}" - assert answer.endswith("条"), f"answer: {answer}" - - answer = answer.replace("法条:刑法第", "") - answer = answer.replace("条", "") - - answer_law_indices = answer.split("、") - answer_law_index_digit_list = [] - for answer_law_index in answer_law_indices: - assert answer_law_index.isdigit(), f"answer_law_index: {answer_law_index}" - answer_law_index_digit = int(answer_law_index) - assert answer_law_index_digit <= 490, "刑法总共只有490条" - answer_law_index_digit_list.append(answer_law_index_digit) - - prediction_law_chunks = prediction.split("、") - prediction_law_index_digit_list = [] - - for prediction_law_chunk in prediction_law_chunks: - prediction_law_chunk = prediction_law_chunk.replace("万元", "元") - - # delete phrase starts with "第" and ends with "款", we don't care about it in the answer - prediction_law_chunk = re.sub(r'第(.*?)款', "", prediction_law_chunk) - # keep only the digits in the phrase starts with "第" and ends with "条", otherwise cn may fail to convert - prediction_law_chunk = re.sub(r'第(.*?)条', replace_match, prediction_law_chunk) - prediction_law_chunk = cn2an.transform(prediction_law_chunk, "cn2an") - # find digtis in prediction_law_chunk - prediction_law_section_numbers = re.findall(r"\d+", prediction_law_chunk) - if len(prediction_law_section_numbers) == 0: - continue - if len(prediction_law_section_numbers) != 1: - # in this case, we only take the first number, and reject the others - pass - - prediction_law_index_digit = int(prediction_law_section_numbers[0]) - prediction_law_index_digit_list.append(prediction_law_index_digit) - - 
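
compute_f1_two_sets, used by compute_ljp_accusation above (and by the event-detection grader for sjjc further down), is imported from ..utils.function_utils and is not part of this diff. Under the standard set precision/recall definition, which is assumed here rather than read from that helper, it behaves like:

def f1_two_sets_sketch(gt_set: set, pred_set: set) -> float:
    # Standard set F1: harmonic mean of precision and recall over the two sets.
    if not gt_set or not pred_set:
        return 0.0
    tp = len(gt_set & pred_set)
    precision = tp / len(pred_set)
    recall = tp / len(gt_set)
    return 2 * precision * recall / (precision + recall) if tp else 0.0


print(f1_two_sets_sketch({'盗窃', '诈骗'}, {'诈骗'}))  # approx. 0.667 (one of two charges recovered)
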
gt_set = set(answer_law_index_digit_list) - pred_set = set(prediction_law_index_digit_list) - if len(pred_set) == 0: - abstentions += 1 - precision = len(gt_set.intersection(pred_set)) / len(pred_set) if len(pred_set) != 0 else 0 - recall = len(gt_set.intersection(pred_set)) / len(gt_set) if len(gt_set) != 0 else 0 - f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 - score_list.append(f1_score) - - # compute the accuracy of score_list - average_f1 = sum(score_list) / len(score_list) - return {'score': average_f1, 'abstention_rate': abstentions/len(data_dict)} +import re + +""" +task: law article prediction +metric: F1 score +法律判决预测-法条预测 +""" +def replace_match(match): + return match.group(1) + +def compute_ljp_article(data_dict): + """ + Compute the F1-score + A reference contains a list of articles of the Criminal Law of the People's Republic of China. + We compute the F1-score between the prediction and the reference. + """ + import cn2an + + score_list, abstentions = [], 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("法条:刑法第"), f"answer: {answer}" + assert answer.endswith("条"), f"answer: {answer}" + + answer = answer.replace("法条:刑法第", "") + answer = answer.replace("条", "") + + answer_law_indices = answer.split("、") + answer_law_index_digit_list = [] + for answer_law_index in answer_law_indices: + assert answer_law_index.isdigit(), f"answer_law_index: {answer_law_index}" + answer_law_index_digit = int(answer_law_index) + assert answer_law_index_digit <= 490, "刑法总共只有490条" + answer_law_index_digit_list.append(answer_law_index_digit) + + prediction_law_chunks = prediction.split("、") + prediction_law_index_digit_list = [] + + for prediction_law_chunk in prediction_law_chunks: + prediction_law_chunk = prediction_law_chunk.replace("万元", "元") + + # delete phrase starts with "第" and ends with "款", we don't care about it in the answer + prediction_law_chunk = re.sub(r'第(.*?)款', "", prediction_law_chunk) + # keep only the digits in the phrase starts with "第" and ends with "条", otherwise cn may fail to convert + prediction_law_chunk = re.sub(r'第(.*?)条', replace_match, prediction_law_chunk) + prediction_law_chunk = cn2an.transform(prediction_law_chunk, "cn2an") + # find digtis in prediction_law_chunk + prediction_law_section_numbers = re.findall(r"\d+", prediction_law_chunk) + if len(prediction_law_section_numbers) == 0: + continue + if len(prediction_law_section_numbers) != 1: + # in this case, we only take the first number, and reject the others + pass + + prediction_law_index_digit = int(prediction_law_section_numbers[0]) + prediction_law_index_digit_list.append(prediction_law_index_digit) + + gt_set = set(answer_law_index_digit_list) + pred_set = set(prediction_law_index_digit_list) + if len(pred_set) == 0: + abstentions += 1 + precision = len(gt_set.intersection(pred_set)) / len(pred_set) if len(pred_set) != 0 else 0 + recall = len(gt_set.intersection(pred_set)) / len(gt_set) if len(gt_set) != 0 else 0 + f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 + score_list.append(f1_score) + + # compute the accuracy of score_list + average_f1 = sum(score_list) / len(score_list) + return {'score': average_f1, 'abstention_rate': abstentions/len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py b/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py index 
e051caf9..7d858d2a 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py @@ -1,54 +1,54 @@ -import math - -import re - -#法律判决预测-刑期预测 -def compute_ljp_imprison(data_dict): - import cn2an - - score_list, abstentions = [], 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - # get answer digit, which is the number between "刑期:" and "个月" - if "死刑" in answer or "无期" in answer: - # TODO: data imperfection - continue - - assert answer.startswith("刑期:") and answer.endswith("个月"), f"answer: {answer}, question: {question}" - answer = answer.replace("刑期:", "") - answer = answer.replace("个月", "") - answer_digit = int(answer) - prediction = cn2an.transform(prediction, "cn2an") - - # use regular expression to extract the digits from prediction, only consider digits before "个月" or "月" - prediction_digit_month_list = re.findall(r"\d+个月", prediction) - prediction_digit_month_list = [int(digit.replace("个月", "")) for digit in prediction_digit_month_list] - prediction_digit_month_list2 = re.findall(r"\d+月", prediction) - prediction_digit_month_list2 = [int(digit.replace("月", "")) for digit in prediction_digit_month_list2] - prediction_digit_month_list.extend(prediction_digit_month_list2) - # catches the digits before "年" - prediction_digit_year_list = re.findall(r"\d+年", prediction) - prediction_digit_year_list = [int(digit.replace("年", "")) for digit in prediction_digit_year_list] - - if len(prediction_digit_month_list) > 0: - prediction_digit_month = int(prediction_digit_month_list[0]) - elif len(prediction_digit_year_list) > 0: - prediction_digit_month = int(prediction_digit_year_list[0]) * 12 - else: - abstentions += 1 - prediction_digit_month = -1 - - if prediction_digit_month != -1: - score_list.append(abs(math.log(answer_digit + 1) - math.log(prediction_digit_month + 1))) - else: - score_list.append(math.log(216)) - - if abstentions == len(score_list): - log_distance = 0 - else: - # compute the average of score_list (log distance) - log_distance = sum(score_list) / len(score_list) - # normalize the score to between 0 and 1 - log_distance = (math.log(216) - log_distance)/math.log(216) - return {"score": log_distance, "abstention_rate": abstentions/len(data_dict)} +import math + +import re + +#法律判决预测-刑期预测 +def compute_ljp_imprison(data_dict): + import cn2an + + score_list, abstentions = [], 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + # get answer digit, which is the number between "刑期:" and "个月" + if "死刑" in answer or "无期" in answer: + # TODO: data imperfection + continue + + assert answer.startswith("刑期:") and answer.endswith("个月"), f"answer: {answer}, question: {question}" + answer = answer.replace("刑期:", "") + answer = answer.replace("个月", "") + answer_digit = int(answer) + prediction = cn2an.transform(prediction, "cn2an") + + # use regular expression to extract the digits from prediction, only consider digits before "个月" or "月" + prediction_digit_month_list = re.findall(r"\d+个月", prediction) + prediction_digit_month_list = [int(digit.replace("个月", "")) for digit in prediction_digit_month_list] + prediction_digit_month_list2 = re.findall(r"\d+月", prediction) + prediction_digit_month_list2 = [int(digit.replace("月", "")) for digit in prediction_digit_month_list2] + prediction_digit_month_list.extend(prediction_digit_month_list2) + # catches 
the digits before "年" + prediction_digit_year_list = re.findall(r"\d+年", prediction) + prediction_digit_year_list = [int(digit.replace("年", "")) for digit in prediction_digit_year_list] + + if len(prediction_digit_month_list) > 0: + prediction_digit_month = int(prediction_digit_month_list[0]) + elif len(prediction_digit_year_list) > 0: + prediction_digit_month = int(prediction_digit_year_list[0]) * 12 + else: + abstentions += 1 + prediction_digit_month = -1 + + if prediction_digit_month != -1: + score_list.append(abs(math.log(answer_digit + 1) - math.log(prediction_digit_month + 1))) + else: + score_list.append(math.log(216)) + + if abstentions == len(score_list): + log_distance = 0 + else: + # compute the average of score_list (log distance) + log_distance = sum(score_list) / len(score_list) + # normalize the score to between 0 and 1 + log_distance = (math.log(216) - log_distance)/math.log(216) + return {"score": log_distance, "abstention_rate": abstentions/len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/sjjc.py b/opencompass/datasets/lawbench/evaluation_functions/sjjc.py index d5d9b7e3..0ff6f1de 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/sjjc.py +++ b/opencompass/datasets/lawbench/evaluation_functions/sjjc.py @@ -1,64 +1,64 @@ -from ..utils.function_utils import compute_f1_two_sets -from ..utils.rc_f1 import CJRCEvaluator - - -""" -task: event detection -metric: F1 score -事件检测 -""" -option_list = ["支付/给付", "欺骗", "搜查/扣押", "要求/请求", "卖出", "买入", "获利", "拘捕", "鉴定", "同意/接受", "供述", "联络", "帮助/救助", "租用/借用", "受伤", "伪造", "卖淫", "伤害人身", "赔偿", "归还/偿还"] - -def compute_sjjc(data_dict): - """ - Compute the F1-score - The sjjc task covers 20 event types. - A question may involve one or more event types. - Given a list of event types from both the ground truth and the prediction, we compute the F1-score between - these two lists. 
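As a quick illustration of the set-level F1 this docstring refers to: the real helper is compute_f1_two_sets from ..utils.function_utils, whose implementation is not shown in this patch, so the function below is only a sketch of its assumed behaviour.

```python
def f1_two_sets(gt_set: set, pred_set: set) -> float:
    """Sketch of the assumed behaviour of compute_f1_two_sets (set precision/recall F1)."""
    if not gt_set or not pred_set:
        return 0.0
    overlap = len(gt_set & pred_set)
    precision = overlap / len(pred_set)
    recall = overlap / len(gt_set)
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

# Hypothetical example: two of the three predicted event types are correct.
gt = {"欺骗", "获利", "赔偿"}
pred = {"欺骗", "获利", "拘捕"}
print(round(f1_two_sets(gt, pred), 4))  # precision 2/3, recall 2/3 -> 0.6667
```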
- """ - score_list, abstentions = [], 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - - answers = answer.split(";") - - prediction_list =[] - for option in option_list: - if option in prediction: - prediction_list.append(option) - - if len(prediction_list) == 0: - abstentions += 1 - gt_set = set(answers) - pred_set = set(prediction_list) - score = compute_f1_two_sets(gt_set, pred_set) - score_list.append(score) - - f1_score_average = sum(score_list) / len(score_list) - return {"score": f1_score_average, "abstention_rate": abstentions/len(data_dict)} - -""" -task: trigger word extraction -metric: F1 score -触发词抽取 -""" -def compute_cfcy(data_dict): - - scores = 0 - - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - - answers = answer.split(";") - predictions = prediction.split(";") - intersected = [CJRCEvaluator.compute_f1(r, h) for r, h in zip(answers, predictions)] - - prec = sum(intersected) / len(predictions) if len(predictions) > 0 else 0 - rec = sum(intersected) / len(answers) if len(answers) > 0 else 0 - # print(prec, rec, intersected) - scores += 2 * prec * rec / (prec + rec + 1e-10) - - f1_score_average = scores / len(data_dict) - return {"score": f1_score_average} +from ..utils.function_utils import compute_f1_two_sets +from ..utils.rc_f1 import CJRCEvaluator + + +""" +task: event detection +metric: F1 score +事件检测 +""" +option_list = ["支付/给付", "欺骗", "搜查/扣押", "要求/请求", "卖出", "买入", "获利", "拘捕", "鉴定", "同意/接受", "供述", "联络", "帮助/救助", "租用/借用", "受伤", "伪造", "卖淫", "伤害人身", "赔偿", "归还/偿还"] + +def compute_sjjc(data_dict): + """ + Compute the F1-score + The sjjc task covers 20 event types. + A question may involve one or more event types. + Given a list of event types from both the ground truth and the prediction, we compute the F1-score between + these two lists. 
+ """ + score_list, abstentions = [], 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + + answers = answer.split(";") + + prediction_list =[] + for option in option_list: + if option in prediction: + prediction_list.append(option) + + if len(prediction_list) == 0: + abstentions += 1 + gt_set = set(answers) + pred_set = set(prediction_list) + score = compute_f1_two_sets(gt_set, pred_set) + score_list.append(score) + + f1_score_average = sum(score_list) / len(score_list) + return {"score": f1_score_average, "abstention_rate": abstentions/len(data_dict)} + +""" +task: trigger word extraction +metric: F1 score +触发词抽取 +""" +def compute_cfcy(data_dict): + + scores = 0 + + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + + answers = answer.split(";") + predictions = prediction.split(";") + intersected = [CJRCEvaluator.compute_f1(r, h) for r, h in zip(answers, predictions)] + + prec = sum(intersected) / len(predictions) if len(predictions) > 0 else 0 + rec = sum(intersected) / len(answers) if len(answers) > 0 else 0 + # print(prec, rec, intersected) + scores += 2 * prec * rec / (prec + rec + 1e-10) + + f1_score_average = scores / len(data_dict) + return {"score": f1_score_average} diff --git a/opencompass/datasets/lawbench/evaluation_functions/wbfl.py b/opencompass/datasets/lawbench/evaluation_functions/wbfl.py index edde3eb9..7ed4334b 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/wbfl.py +++ b/opencompass/datasets/lawbench/evaluation_functions/wbfl.py @@ -1,42 +1,42 @@ -""" -task: multiple choice classification -metric: F1 score -婚姻文本分类 -""" - -def compute_wbfl(data_dict): - """ - A reference (R) contains a list of options, each option is from the option_list. - We will extract the options appearing in the prediction and convert them into a set (P). - We compute the F1 score between the prediction (P) and the reference (R). 
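Concretely, the matching described here works by substring-testing every option against the prediction text and then scoring the two label sets. A small self-contained sketch with hypothetical marriage-case labels (not taken from the dataset):

```python
option_list = ["婚后有子女", "支付抚养费", "准予离婚", "不动产分割"]
answer = "类别:婚后有子女、支付抚养费。"
prediction = "本案中婚后有子女,法院应准予离婚。"

gt_set = set(answer[3:-1].split("、"))                         # {'婚后有子女', '支付抚养费'}
pred_set = {opt for opt in option_list if opt in prediction}   # {'婚后有子女', '准予离婚'}

overlap = len(gt_set & pred_set)                               # 1
precision, recall = overlap / len(pred_set), overlap / len(gt_set)
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
print(round(f1, 2))                                            # 0.5
```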
- """ - - - score_list, abstentions = [], 0 - option_list = ["婚后有子女", "限制行为能力子女抚养", "有夫妻共同财产", "支付抚养费", "不动产分割", "婚后分局", - "二次起诉离婚", "按月给付抚养费", "准予离婚", "有夫妻共同债务", "婚前个人财产", "法定离婚", "不履行家庭义务", - "存在非婚生子", "适当帮助", "不履行离婚协议", "损害赔偿", "感情不和分居满二年", "子女随非抚养权人生活", "婚后个人财产"] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - assert answer.startswith("类别:") and answer.endswith("。"), f"answer: {answer}, question: {question}" - - gt_list = (answer[3:-1].split("、")) - for gt in gt_list: - assert gt in option_list, f"gt: {gt}, question: {question}" - gt_set = set(gt_list) - - prediction_list = [] - for option in option_list: - if option in prediction: - prediction_list.append(option) - if len(prediction_list) == 0: - abstentions += 1 - predict_set = set(prediction_list) - precision = len(gt_set.intersection(predict_set)) / len(predict_set) if len(predict_set) != 0 else 0 - recall = len(gt_set.intersection(predict_set)) / len(gt_set) if len(gt_set) != 0 else 0 - f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 - score_list.append(f1_score) - - # compute the accuracy of score_list - final_f1_score = sum(score_list) / len(score_list) - return {'score': final_f1_score, 'abstention_rate': abstentions / len(data_dict)} +""" +task: multiple choice classification +metric: F1 score +婚姻文本分类 +""" + +def compute_wbfl(data_dict): + """ + A reference (R) contains a list of options, each option is from the option_list. + We will extract the options appearing in the prediction and convert them into a set (P). + We compute the F1 score between the prediction (P) and the reference (R). + """ + + + score_list, abstentions = [], 0 + option_list = ["婚后有子女", "限制行为能力子女抚养", "有夫妻共同财产", "支付抚养费", "不动产分割", "婚后分局", + "二次起诉离婚", "按月给付抚养费", "准予离婚", "有夫妻共同债务", "婚前个人财产", "法定离婚", "不履行家庭义务", + "存在非婚生子", "适当帮助", "不履行离婚协议", "损害赔偿", "感情不和分居满二年", "子女随非抚养权人生活", "婚后个人财产"] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + assert answer.startswith("类别:") and answer.endswith("。"), f"answer: {answer}, question: {question}" + + gt_list = (answer[3:-1].split("、")) + for gt in gt_list: + assert gt in option_list, f"gt: {gt}, question: {question}" + gt_set = set(gt_list) + + prediction_list = [] + for option in option_list: + if option in prediction: + prediction_list.append(option) + if len(prediction_list) == 0: + abstentions += 1 + predict_set = set(prediction_list) + precision = len(gt_set.intersection(predict_set)) / len(predict_set) if len(predict_set) != 0 else 0 + recall = len(gt_set.intersection(predict_set)) / len(gt_set) if len(gt_set) != 0 else 0 + f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 + score_list.append(f1_score) + + # compute the accuracy of score_list + final_f1_score = sum(score_list) / len(score_list) + return {'score': final_f1_score, 'abstention_rate': abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/evaluation_functions/wsjd.py b/opencompass/datasets/lawbench/evaluation_functions/wsjd.py index 4c0a6dad..ae322686 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/wsjd.py +++ b/opencompass/datasets/lawbench/evaluation_functions/wsjd.py @@ -1,52 +1,52 @@ -import re -import os -import subprocess -import tempfile - -""" -Task: legal document grammar correction -Metric: F0.5 score -文书校对 -""" -def compute_wsjd(data_dict): - origins, references, 
predictions = [], [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - if isinstance(question, list): - question = question[0]['prompt'] - start = question.index('句子:\n') + 4 - origins.append(re.sub(r'\n|\t', '', question[start:].split('\n')[0])) - # truncate predictions >5 tokens longer than the reference - prediction = re.sub(r'\n|\t', '', prediction) - if len(prediction) - len(answer) > 5: - prediction = prediction[:len(answer) + 5] - if len(prediction) == 0: - prediction = "无内容" - predictions.append(prediction) - references.append(re.sub(r'\n|\t', '', answer)) - - #generate input files for ChERRANT - preds = [f'{i} \t {origin} \t {prediction} \n' for i, (origin, prediction) in enumerate(zip(origins, predictions))] - golds = [f'{i} \t {origin} \t {reference} \n' for i, (origin, reference) in enumerate(zip(origins, references))] - - now_path = os.path.abspath(os.getcwd()) - utils_path = os.path.abspath(os.path.join(__file__, '..', '..', 'utils')) - os.chdir(utils_path) - with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_pred_file, \ - tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_gold_file: - tmp_pred_file.writelines(preds) - tmp_gold_file.writelines(golds) - - os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' - os.system(f'python3 parallel_to_m2.py -f {tmp_pred_file.name} -o {tmp_pred_file.name}.m2 -g char') - os.system(f'python3 parallel_to_m2.py -f {tmp_gold_file.name} -o {tmp_gold_file.name}.m2 -g char') - output = subprocess.check_output( - f"python3 compare_m2_for_evaluation.py -hyp {tmp_pred_file.name}.m2 -ref {tmp_gold_file.name}.m2", shell=True) - score = float(output.decode().split('\t')[-1].split('\n')[0]) - #remove prediction files - os.remove(tmp_pred_file.name) - os.remove(tmp_gold_file.name) - os.remove(f"{tmp_pred_file.name}.m2") - os.remove(f"{tmp_gold_file.name}.m2") - os.chdir(now_path) - return {"score": score} +import re +import os +import subprocess +import tempfile + +""" +Task: legal document grammar correction +Metric: F0.5 score +文书校对 +""" +def compute_wsjd(data_dict): + origins, references, predictions = [], [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + if isinstance(question, list): + question = question[0]['prompt'] + start = question.index('句子:\n') + 4 + origins.append(re.sub(r'\n|\t', '', question[start:].split('\n')[0])) + # truncate predictions >5 tokens longer than the reference + prediction = re.sub(r'\n|\t', '', prediction) + if len(prediction) - len(answer) > 5: + prediction = prediction[:len(answer) + 5] + if len(prediction) == 0: + prediction = "无内容" + predictions.append(prediction) + references.append(re.sub(r'\n|\t', '', answer)) + + #generate input files for ChERRANT + preds = [f'{i} \t {origin} \t {prediction} \n' for i, (origin, prediction) in enumerate(zip(origins, predictions))] + golds = [f'{i} \t {origin} \t {reference} \n' for i, (origin, reference) in enumerate(zip(origins, references))] + + now_path = os.path.abspath(os.getcwd()) + utils_path = os.path.abspath(os.path.join(__file__, '..', '..', 'utils')) + os.chdir(utils_path) + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_pred_file, \ + tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_gold_file: + tmp_pred_file.writelines(preds) + tmp_gold_file.writelines(golds) + + os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' + os.system(f'python3 parallel_to_m2.py -f 
{tmp_pred_file.name} -o {tmp_pred_file.name}.m2 -g char') + os.system(f'python3 parallel_to_m2.py -f {tmp_gold_file.name} -o {tmp_gold_file.name}.m2 -g char') + output = subprocess.check_output( + f"python3 compare_m2_for_evaluation.py -hyp {tmp_pred_file.name}.m2 -ref {tmp_gold_file.name}.m2", shell=True) + score = float(output.decode().split('\t')[-1].split('\n')[0]) + #remove prediction files + os.remove(tmp_pred_file.name) + os.remove(tmp_gold_file.name) + os.remove(f"{tmp_pred_file.name}.m2") + os.remove(f"{tmp_gold_file.name}.m2") + os.chdir(now_path) + return {"score": score} diff --git a/opencompass/datasets/lawbench/evaluation_functions/xxcq.py b/opencompass/datasets/lawbench/evaluation_functions/xxcq.py index 679d94d7..c504c730 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/xxcq.py +++ b/opencompass/datasets/lawbench/evaluation_functions/xxcq.py @@ -1,17 +1,17 @@ -from ..utils.comprehension_scores import compute_ie_f1 - - -""" -task: information extraction -metric: F1 score -信息抽取 -""" -def compute_xxcq(data_dict): - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - predictions.append(prediction) - references.append(answer) - - return compute_ie_f1(predictions, references, {"犯罪嫌疑人", "受害人", "被盗货币", "物品价值", "盗窃获利", - "被盗物品", "作案工具", "时间", "地点", "组织机构"}) +from ..utils.comprehension_scores import compute_ie_f1 + + +""" +task: information extraction +metric: F1 score +信息抽取 +""" +def compute_xxcq(data_dict): + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + predictions.append(prediction) + references.append(answer) + + return compute_ie_f1(predictions, references, {"犯罪嫌疑人", "受害人", "被盗货币", "物品价值", "盗窃获利", + "被盗物品", "作案工具", "时间", "地点", "组织机构"}) diff --git a/opencompass/datasets/lawbench/evaluation_functions/ydlj.py b/opencompass/datasets/lawbench/evaluation_functions/ydlj.py index 5081e027..1065959f 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/ydlj.py +++ b/opencompass/datasets/lawbench/evaluation_functions/ydlj.py @@ -1,17 +1,17 @@ -from ..utils.comprehension_scores import compute_rc_f1 - -""" -Task: machine reading comprehension -Metric: F1 score -法律阅读理解 -""" -def compute_ydlj(data_dict): - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - answer = answer.replace("回答:", "") - predictions.append(prediction) - references.append(answer) - - f1_score = compute_rc_f1(predictions, references) - return f1_score +from ..utils.comprehension_scores import compute_rc_f1 + +""" +Task: machine reading comprehension +Metric: F1 score +法律阅读理解 +""" +def compute_ydlj(data_dict): + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + answer = answer.replace("回答:", "") + predictions.append(prediction) + references.append(answer) + + f1_score = compute_rc_f1(predictions, references) + return f1_score diff --git a/opencompass/datasets/lawbench/evaluation_functions/yqzy.py b/opencompass/datasets/lawbench/evaluation_functions/yqzy.py index 1568050d..57b62466 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/yqzy.py +++ b/opencompass/datasets/lawbench/evaluation_functions/yqzy.py @@ -1,18 +1,18 @@ -from ..utils.function_utils 
import compute_rouge - -#舆情摘要 -def compute_yqzy(data_dict): - """ - Compute the ROUGE-L score between the prediction and the reference - """ - references, predictions = [], [] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - predictions.append(prediction) - references.append(answer) - - # compute the accuracy of score_list - rouge_scores = compute_rouge(predictions, references) - rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] - average_rouge_l = sum(rouge_ls) / len(rouge_ls) - return {"score": average_rouge_l} +from ..utils.function_utils import compute_rouge + +#舆情摘要 +def compute_yqzy(data_dict): + """ + Compute the ROUGE-L score between the prediction and the reference + """ + references, predictions = [], [] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + predictions.append(prediction) + references.append(answer) + + # compute the accuracy of score_list + rouge_scores = compute_rouge(predictions, references) + rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores] + average_rouge_l = sum(rouge_ls) / len(rouge_ls) + return {"score": average_rouge_l} diff --git a/opencompass/datasets/lawbench/evaluation_functions/zxfl.py b/opencompass/datasets/lawbench/evaluation_functions/zxfl.py index 7e4b0bc4..4cb0ec00 100644 --- a/opencompass/datasets/lawbench/evaluation_functions/zxfl.py +++ b/opencompass/datasets/lawbench/evaluation_functions/zxfl.py @@ -1,27 +1,27 @@ -from ..utils.function_utils import multi_choice_judge - -""" -task: multiple choice classification -metric: accuracy -咨询分类 -""" - -def compute_zxfl(data_dict): - """ - A reference (R) contains a list of options, each option is from the option_list. - We will extract the options appearing in the prediction and convert them into a set (P). - We compute the accuracy between the prediction (P) and the reference (R). - """ - - - score_list, abstentions = [], 0 - option_list = ['婚姻家庭', '劳动纠纷', '交通事故', '债权债务', '刑事辩护', '合同纠纷', '房产纠纷', '侵权', '公司法', '医疗纠纷', '拆迁安置', '行政诉讼', '建设工程', '知识产权', '综合咨询', '人身损害', '涉外法律', '海事海商', '消费权益', '抵押担保'] - for example in data_dict: - question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] - judge = multi_choice_judge(prediction, option_list, answer) - score_list.append(judge["score"]) - abstentions += judge["abstention"] - - # compute the accuracy of score_list - final_accuracy_score = sum(score_list) / len(score_list) - return {'score': final_accuracy_score, 'abstention_rate': abstentions / len(data_dict)} +from ..utils.function_utils import multi_choice_judge + +""" +task: multiple choice classification +metric: accuracy +咨询分类 +""" + +def compute_zxfl(data_dict): + """ + A reference (R) contains a list of options, each option is from the option_list. + We will extract the options appearing in the prediction and convert them into a set (P). + We compute the accuracy between the prediction (P) and the reference (R). 
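multi_choice_judge is imported from ..utils.function_utils and its implementation is not part of this patch; all this file relies on is that it returns a dict with "score" and "abstention" keys. The sketch below is therefore only a plausible stand-in for illustration, not the real helper.

```python
def multi_choice_judge_sketch(prediction: str, option_list, answer: str) -> dict:
    """Assumed contract only: 1 point iff the reference option (and no other) is mentioned;
    an abstention is recorded when none of the options appears in the prediction."""
    mentioned = [opt for opt in option_list if opt in prediction]
    if not mentioned:
        return {"score": 0, "abstention": 1}
    score = 1 if len(mentioned) == 1 and mentioned[0] == answer else 0
    return {"score": score, "abstention": 0}

# Hypothetical usage with two of the 咨询分类 labels above:
print(multi_choice_judge_sketch("这应当属于劳动纠纷。", ["婚姻家庭", "劳动纠纷"], "劳动纠纷"))
# -> {'score': 1, 'abstention': 0}
```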
+ """ + + + score_list, abstentions = [], 0 + option_list = ['婚姻家庭', '劳动纠纷', '交通事故', '债权债务', '刑事辩护', '合同纠纷', '房产纠纷', '侵权', '公司法', '医疗纠纷', '拆迁安置', '行政诉讼', '建设工程', '知识产权', '综合咨询', '人身损害', '涉外法律', '海事海商', '消费权益', '抵押担保'] + for example in data_dict: + question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"] + judge = multi_choice_judge(prediction, option_list, answer) + score_list.append(judge["score"]) + abstentions += judge["abstention"] + + # compute the accuracy of score_list + final_accuracy_score = sum(score_list) / len(score_list) + return {'score': final_accuracy_score, 'abstention_rate': abstentions / len(data_dict)} diff --git a/opencompass/datasets/lawbench/utils/__init__.py b/opencompass/datasets/lawbench/utils/__init__.py index 8b137891..d3f5a12f 100644 --- a/opencompass/datasets/lawbench/utils/__init__.py +++ b/opencompass/datasets/lawbench/utils/__init__.py @@ -1 +1 @@ - + diff --git a/opencompass/datasets/lawbench/utils/char_smi.py b/opencompass/datasets/lawbench/utils/char_smi.py index 54bb4790..0d257601 100644 --- a/opencompass/datasets/lawbench/utils/char_smi.py +++ b/opencompass/datasets/lawbench/utils/char_smi.py @@ -1,456 +1,456 @@ -### Copy from https://github.com/iqiyi/FASPell ### - -""" -Requirements: - - java (required only if tree edit distance is used) - - numpy -""" -import numpy as np -from subprocess import Popen, PIPE, STDOUT -import os -import argparse - -IDCS = {'\u2ff0': 2, # 12 ideographic description characters and their capacity of son nodes - '\u2ff1': 2, - '\u2ff2': 3, - '\u2ff3': 3, - '\u2ff4': 2, - '\u2ff5': 2, - '\u2ff6': 2, - '\u2ff7': 2, - '\u2ff8': 2, - '\u2ff9': 2, - '\u2ffa': 2, - '\u2ffb': 2, } - -PINYIN = {'ā': ['a', 1], 'á': ['a', 2], 'ǎ': ['a', 3], 'à': ['a', 4], - 'ē': ['e', 1], 'é': ['e', 2], 'ě': ['e', 3], 'è': ['e', 4], - 'ī': ['i', 1], 'í': ['i', 2], 'ǐ': ['i', 3], 'ì': ['i', 4], - 'ō': ['o', 1], 'ó': ['o', 2], 'ǒ': ['o', 3], 'ò': ['o', 4], - 'ū': ['u', 1], 'ú': ['u', 2], 'ǔ': ['u', 3], 'ù': ['u', 4], - 'ǖ': ['ü', 1], 'ǘ': ['ü', 2], 'ǚ': ['ü', 3], 'ǜ': ['ü', 4], - '': ['m', 2], 'ń': ['n', 2], 'ň': ['n', 3], 'ǹ': ['n', 4], - } - -# APTED_JAR_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'apted.jar') -APTED_JAR_PATH = 'apted.jar' - - -def tree_edit_distance(tree_a, tree_b): - """ - We use APTED algorithm proposed by M. Pawlik and N. 
Augsten - github link: https://github.com/DatabaseGroup/apted - """ - p = Popen(['java', '-jar', APTED_JAR_PATH, '-t', tree_a, tree_b], stdout=PIPE, stderr=STDOUT) - - res = [line for line in p.stdout] - res = res[0] - res = res.strip() - res = float(res) - - return res - - -def edit_distance(string_a, string_b, name='Levenshtein'): - """ - >>> edit_distance('abcde', 'avbcude') - 2 - >>> edit_distance(['至', '刂'], ['亻', '至', '刂']) - 1 - >>> edit_distance('fang', 'qwe') - 4 - >>> edit_distance('fang', 'hen') - 3 - """ - size_x = len(string_a) + 1 - size_y = len(string_b) + 1 - matrix = np.zeros((size_x, size_y), dtype=int) - for x in range(size_x): - matrix[x, 0] = x - for y in range(size_y): - matrix[0, y] = y - - for x in range(1, size_x): - for y in range(1, size_y): - if string_a[x - 1] == string_b[y - 1]: - matrix[x, y] = min( - matrix[x - 1, y] + 1, - matrix[x - 1, y - 1], - matrix[x, y - 1] + 1 - ) - else: - if name == 'Levenshtein': - matrix[x, y] = min( - matrix[x - 1, y] + 1, - matrix[x - 1, y - 1] + 1, - matrix[x, y - 1] + 1 - ) - else: # Canonical - matrix[x, y] = min( - matrix[x - 1, y] + 1, - matrix[x - 1, y - 1] + 2, - matrix[x, y - 1] + 1 - ) - - return matrix[size_x - 1, size_y - 1] - - -class CharFuncs(object): - def __init__(self, char_meta_fname): - self.data = self.load_char_meta(char_meta_fname) - self.char_dict = dict([(c, 0) for c in self.data]) - - self.safe = {'\u2ff0': 'A', - # to eliminate the bug that, in Windows CMD, char ⿻ and ⿵ are encoded to be the same. - '\u2ff1': 'B', - '\u2ff2': 'C', - '\u2ff3': 'D', - '\u2ff4': 'E', - '\u2ff5': 'F', - '\u2ff6': 'G', - '\u2ff7': 'H', - '\u2ff8': 'I', - '\u2ff9': 'J', - '\u2ffa': 'L', - '\u2ffb': 'M', } - - @staticmethod - def load_char_meta(fname): - data = {} - f = open(fname, 'r', encoding='utf-8') - for line in f: - items = line.strip().split('\t') - code_point = items[0] - char = items[1] - pronunciation = items[2] - decompositions = items[3:] - assert char not in data - data[char] = {"code_point": code_point, "pronunciation": pronunciation, "decompositions": decompositions} - return data - - def shape_distance(self, char1, char2, safe=True, as_tree=False): - """ - >>> c = CharFuncs('data/char_meta.txt') - >>> c.shape_distance('田', '由') - 1 - >>> c.shape_distance('牛', '午') - 1 - """ - assert char1 in self.data - assert char2 in self.data - - def safe_encode(decomp): - tree = '' - for c in string_to_tree(decomp): - if c not in self.safe: - tree += c - else: - tree += self.safe[c] - return tree - - def safe_encode_string(decomp): - tree = '' - for c in decomp: - if c not in self.safe: - tree += c - else: - tree += self.safe[c] - return tree - - decomps_1 = self.data[char1]["decompositions"] - decomps_2 = self.data[char2]["decompositions"] - - distance = 1e5 - if as_tree: - for decomp1 in decomps_1: - for decomp2 in decomps_2: - if not safe: - ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) - else: - ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) - distance = min(distance, ted) - else: - for decomp1 in decomps_1: - for decomp2 in decomps_2: - if not safe: - ed = edit_distance(decomp1, decomp2) - else: - ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) - distance = min(distance, ed) - - return distance - - def pronunciation_distance(self, char1, char2): - """ - >>> c = CharFuncs('data/char_meta.txt') - >>> c.pronunciation_distance('田', '由') - 3.4 - >>> c.pronunciation_distance('牛', '午') - 2.6 - """ - assert char1 in self.data - assert char2 in 
self.data - pronunciations1 = self.data[char1]["pronunciation"] - pronunciations2 = self.data[char2]["pronunciation"] - - if pronunciations1[0] == 'null' or pronunciations2 == 'null': - return 0.0 - else: - - pronunciations1 = pronunciations1.split(';') # separate by lan - pronunciations2 = pronunciations2.split(';') # separate by lan - - distance = 0.0 - count = 0 - for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): - if (pron_lan1 == 'null') or (pron_lan2 == 'null'): - pass - else: - distance_lan = 1e5 - for p1 in pron_lan1.split(','): - for p2 in pron_lan2.split(','): - distance_lan = min(distance_lan, edit_distance(p1, p2)) - distance += distance_lan - count += 1 - - return distance / count - - @staticmethod - def load_dict(fname): - data = {} - f = open(fname, 'r', encoding='utf-8') - for line in f: - char, freq = line.strip().split('\t') - assert char not in data - data[char] = freq - - return data - - def similarity(self, char1, char2, weights=(0.8, 0.2, 0.0), as_tree=False): - """ - this function returns weighted similarity. When used in FASPell, each weight can only be 0 or 1. - """ - - # assert char1 in self.char_dict - # assert char2 in self.char_dict - shape_w, sound_w, freq_w = weights - - if char1 in self.char_dict and char2 in self.char_dict: - - shape_sim = self.shape_similarity(char1, char2, as_tree=as_tree) - sound_sim = self.pronunciation_similarity(char1, char2) - freq_sim = 1.0 - self.char_dict[char2] / len(self.char_dict) - - return shape_sim * shape_w + sound_sim * sound_w + freq_sim * freq_w - else: - return 0.0 - - def shape_similarity(self, char1, char2, safe=True, as_tree=False): - """ - >>> c = CharFuncs('data/char_meta.txt') - >>> c.shape_similarity('牛', '午') - 0.8571428571428572 - >>> c.shape_similarity('田', '由') - 0.8888888888888888 - """ - assert char1 in self.data - assert char2 in self.data - - def safe_encode(decomp): - tree = '' - for c in string_to_tree(decomp): - if c not in self.safe: - tree += c - else: - tree += self.safe[c] - return tree - - def safe_encode_string(decomp): - tree = '' - for c in decomp: - if c not in self.safe: - tree += c - else: - tree += self.safe[c] - return tree - - decomps_1 = self.data[char1]["decompositions"] - decomps_2 = self.data[char2]["decompositions"] - - similarity = 0.0 - if as_tree: - for decomp1 in decomps_1: - for decomp2 in decomps_2: - if not safe: - ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) - else: - ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) - normalized_ted = 2 * ted / (len(decomp1) + len(decomp2) + ted) - similarity = max(similarity, 1 - normalized_ted) - else: - for decomp1 in decomps_1: - for decomp2 in decomps_2: - if not safe: - ed = edit_distance(decomp1, decomp2) - else: - ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) - normalized_ed = ed / max(len(decomp1), len(decomp2)) - similarity = max(similarity, 1 - normalized_ed) - - return similarity - - def pronunciation_similarity(self, char1, char2): - """ - >>> c = CharFuncs('data/char_meta.txt') - >>> c.pronunciation_similarity('牛', '午') - 0.27999999999999997 - >>> c.pronunciation_similarity('由', '田') - 0.09 - - """ - assert char1 in self.data - assert char2 in self.data - pronunciations1 = self.data[char1]["pronunciation"] - pronunciations2 = self.data[char2]["pronunciation"] - - if pronunciations1[0] == 'null' or pronunciations2 == 'null': - return 0.0 - else: - - pronunciations1 = pronunciations1.split(';') # separate by lan - pronunciations2 = 
pronunciations2.split(';') # separate by lan - - similarity = 0.0 - count = 0 - for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): - if (pron_lan1 == 'null') or (pron_lan2 == 'null'): - pass - else: - similarity_lan = 0.0 - for p1 in pron_lan1.split(','): - for p2 in pron_lan2.split(','): - tmp_sim = 1 - edit_distance(p1, p2) / max(len(p1), len(p2)) - similarity_lan = max(similarity_lan, tmp_sim) - similarity += similarity_lan - count += 1 - - return similarity / count if count else 0 - - -def string_to_tree(string): - """ - This function converts ids string to a string that can be used as a tree input to APTED. - Any Error raised by this function implies that the input string is invalid. - >>> string_to_tree('⿱⿱⿰丿㇏⿰丿㇏⿱⿰丿㇏⿰丿㇏') # 炎 - '{⿱{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}}' - >>> string_to_tree('⿱⿰丿㇏⿱一⿱⿻一丨一') # 全 - '{⿱{⿰{丿}{㇏}}{⿱{一}{⿱{⿻{一}{丨}}{一}}}}' - >>> string_to_tree('⿱⿰丿㇏⿻⿱一⿱⿻一丨一丷') # 金 - '{⿱{⿰{丿}{㇏}}{⿻{⿱{一}{⿱{⿻{一}{丨}}{一}}}{丷}}}' - >>> string_to_tree('⿻⿻⿻一丨一⿴⿱⿰丨𠃌一一') # 車 - '{⿻{⿻{⿻{一}{丨}}{一}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' - >>> string_to_tree('⿻⿻⿻一丨⿰丿㇏⿴⿱⿰丨𠃌一一') # 東 - '{⿻{⿻{⿻{一}{丨}}{⿰{丿}{㇏}}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' - >>> string_to_tree('丿') # 丿 - '{丿}' - >>> string_to_tree('⿻') # ⿻ - '{⿻}' - """ - if string[0] in IDCS and len(string) != 1: - bracket_stack = [] - tree = [] - - def add_brackets(num): - if num == 2: - bracket_stack.extend(['}', '{', '}']) - else: - bracket_stack.extend(['}', '{', '}', '{', '}']) - tree.append('{') - - global_just_put = '{' - - for c in string: - tree.append(c) - if c in IDCS: - assert global_just_put != '}' - add_brackets(IDCS[c]) - global_just_put = '{' - else: - just_put = '' - while just_put != '{' and bracket_stack: - just_put = bracket_stack.pop(-1) - tree.append(just_put) - global_just_put = just_put - - res = ''.join(tree) - assert res[-1] == '}' - else: - assert len(string) == 1 or string == 'null' - res = string[0] - - return '{' + res + '}' - - -def pinyin_map(standard_pinyin): - """ - >>> pinyin_map('xuě') - 'xue3' - >>> pinyin_map('xue') - 'xue' - >>> pinyin_map('lǜ') - 'lü4' - >>> pinyin_map('fá') - 'fa2' - """ - tone = '' - pinyin = '' - - assert ' ' not in standard_pinyin - for c in standard_pinyin: - if c in PINYIN: - pinyin += PINYIN[c][0] - assert tone == '' - tone = str(PINYIN[c][1]) - else: - pinyin += c - pinyin += tone - return pinyin - - -def parse_args(): - usage = '\n1. You can compute character similarity by:\n' \ - 'python char_sim.py 午 牛 年 千\n' \ - '\n' \ - '2. 
You can use ted in computing character similarity by:\n' \ - 'python char_sim.py 午 牛 年 千 -t\n' \ - '\n' - parser = argparse.ArgumentParser( - description='A script to compute Chinese character (Kanji) similarity', usage=usage) - - parser.add_argument('multiargs', nargs='*', type=str, default=None, - help='Chinese characters in question') - parser.add_argument('--ted', '-t', action="store_true", default=False, - help='True=to use tree edit distence (TED)' - 'False=to use string edit distance') - - args = parser.parse_args() - return args - - -if __name__ == '__main__': - args = parse_args() - c = CharFuncs('data/char_meta.txt') - if not args.ted: - for i, c1 in enumerate(args.multiargs): - for c2 in args.multiargs[i:]: - if c1 != c2: - print(f'For character pair ({c1}, {c2}):') - print(f' v-sim = {c.shape_similarity(c1, c2)}') - print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n') - else: - for i, c1 in enumerate(args.multiargs): - for c2 in args.multiargs[i:]: - if c1 != c2: - print(f'For character pair ({c1}, {c2}):') - print(f' v-sim = {c.shape_similarity(c1, c2, as_tree=True)}') +### Copy from https://github.com/iqiyi/FASPell ### + +""" +Requirements: + - java (required only if tree edit distance is used) + - numpy +""" +import numpy as np +from subprocess import Popen, PIPE, STDOUT +import os +import argparse + +IDCS = {'\u2ff0': 2, # 12 ideographic description characters and their capacity of son nodes + '\u2ff1': 2, + '\u2ff2': 3, + '\u2ff3': 3, + '\u2ff4': 2, + '\u2ff5': 2, + '\u2ff6': 2, + '\u2ff7': 2, + '\u2ff8': 2, + '\u2ff9': 2, + '\u2ffa': 2, + '\u2ffb': 2, } + +PINYIN = {'ā': ['a', 1], 'á': ['a', 2], 'ǎ': ['a', 3], 'à': ['a', 4], + 'ē': ['e', 1], 'é': ['e', 2], 'ě': ['e', 3], 'è': ['e', 4], + 'ī': ['i', 1], 'í': ['i', 2], 'ǐ': ['i', 3], 'ì': ['i', 4], + 'ō': ['o', 1], 'ó': ['o', 2], 'ǒ': ['o', 3], 'ò': ['o', 4], + 'ū': ['u', 1], 'ú': ['u', 2], 'ǔ': ['u', 3], 'ù': ['u', 4], + 'ǖ': ['ü', 1], 'ǘ': ['ü', 2], 'ǚ': ['ü', 3], 'ǜ': ['ü', 4], + '': ['m', 2], 'ń': ['n', 2], 'ň': ['n', 3], 'ǹ': ['n', 4], + } + +# APTED_JAR_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'apted.jar') +APTED_JAR_PATH = 'apted.jar' + + +def tree_edit_distance(tree_a, tree_b): + """ + We use APTED algorithm proposed by M. Pawlik and N. 
Augsten + github link: https://github.com/DatabaseGroup/apted + """ + p = Popen(['java', '-jar', APTED_JAR_PATH, '-t', tree_a, tree_b], stdout=PIPE, stderr=STDOUT) + + res = [line for line in p.stdout] + res = res[0] + res = res.strip() + res = float(res) + + return res + + +def edit_distance(string_a, string_b, name='Levenshtein'): + """ + >>> edit_distance('abcde', 'avbcude') + 2 + >>> edit_distance(['至', '刂'], ['亻', '至', '刂']) + 1 + >>> edit_distance('fang', 'qwe') + 4 + >>> edit_distance('fang', 'hen') + 3 + """ + size_x = len(string_a) + 1 + size_y = len(string_b) + 1 + matrix = np.zeros((size_x, size_y), dtype=int) + for x in range(size_x): + matrix[x, 0] = x + for y in range(size_y): + matrix[0, y] = y + + for x in range(1, size_x): + for y in range(1, size_y): + if string_a[x - 1] == string_b[y - 1]: + matrix[x, y] = min( + matrix[x - 1, y] + 1, + matrix[x - 1, y - 1], + matrix[x, y - 1] + 1 + ) + else: + if name == 'Levenshtein': + matrix[x, y] = min( + matrix[x - 1, y] + 1, + matrix[x - 1, y - 1] + 1, + matrix[x, y - 1] + 1 + ) + else: # Canonical + matrix[x, y] = min( + matrix[x - 1, y] + 1, + matrix[x - 1, y - 1] + 2, + matrix[x, y - 1] + 1 + ) + + return matrix[size_x - 1, size_y - 1] + + +class CharFuncs(object): + def __init__(self, char_meta_fname): + self.data = self.load_char_meta(char_meta_fname) + self.char_dict = dict([(c, 0) for c in self.data]) + + self.safe = {'\u2ff0': 'A', + # to eliminate the bug that, in Windows CMD, char ⿻ and ⿵ are encoded to be the same. + '\u2ff1': 'B', + '\u2ff2': 'C', + '\u2ff3': 'D', + '\u2ff4': 'E', + '\u2ff5': 'F', + '\u2ff6': 'G', + '\u2ff7': 'H', + '\u2ff8': 'I', + '\u2ff9': 'J', + '\u2ffa': 'L', + '\u2ffb': 'M', } + + @staticmethod + def load_char_meta(fname): + data = {} + f = open(fname, 'r', encoding='utf-8') + for line in f: + items = line.strip().split('\t') + code_point = items[0] + char = items[1] + pronunciation = items[2] + decompositions = items[3:] + assert char not in data + data[char] = {"code_point": code_point, "pronunciation": pronunciation, "decompositions": decompositions} + return data + + def shape_distance(self, char1, char2, safe=True, as_tree=False): + """ + >>> c = CharFuncs('data/char_meta.txt') + >>> c.shape_distance('田', '由') + 1 + >>> c.shape_distance('牛', '午') + 1 + """ + assert char1 in self.data + assert char2 in self.data + + def safe_encode(decomp): + tree = '' + for c in string_to_tree(decomp): + if c not in self.safe: + tree += c + else: + tree += self.safe[c] + return tree + + def safe_encode_string(decomp): + tree = '' + for c in decomp: + if c not in self.safe: + tree += c + else: + tree += self.safe[c] + return tree + + decomps_1 = self.data[char1]["decompositions"] + decomps_2 = self.data[char2]["decompositions"] + + distance = 1e5 + if as_tree: + for decomp1 in decomps_1: + for decomp2 in decomps_2: + if not safe: + ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) + else: + ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) + distance = min(distance, ted) + else: + for decomp1 in decomps_1: + for decomp2 in decomps_2: + if not safe: + ed = edit_distance(decomp1, decomp2) + else: + ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) + distance = min(distance, ed) + + return distance + + def pronunciation_distance(self, char1, char2): + """ + >>> c = CharFuncs('data/char_meta.txt') + >>> c.pronunciation_distance('田', '由') + 3.4 + >>> c.pronunciation_distance('牛', '午') + 2.6 + """ + assert char1 in self.data + assert char2 in 
self.data + pronunciations1 = self.data[char1]["pronunciation"] + pronunciations2 = self.data[char2]["pronunciation"] + + if pronunciations1[0] == 'null' or pronunciations2 == 'null': + return 0.0 + else: + + pronunciations1 = pronunciations1.split(';') # separate by lan + pronunciations2 = pronunciations2.split(';') # separate by lan + + distance = 0.0 + count = 0 + for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): + if (pron_lan1 == 'null') or (pron_lan2 == 'null'): + pass + else: + distance_lan = 1e5 + for p1 in pron_lan1.split(','): + for p2 in pron_lan2.split(','): + distance_lan = min(distance_lan, edit_distance(p1, p2)) + distance += distance_lan + count += 1 + + return distance / count + + @staticmethod + def load_dict(fname): + data = {} + f = open(fname, 'r', encoding='utf-8') + for line in f: + char, freq = line.strip().split('\t') + assert char not in data + data[char] = freq + + return data + + def similarity(self, char1, char2, weights=(0.8, 0.2, 0.0), as_tree=False): + """ + this function returns weighted similarity. When used in FASPell, each weight can only be 0 or 1. + """ + + # assert char1 in self.char_dict + # assert char2 in self.char_dict + shape_w, sound_w, freq_w = weights + + if char1 in self.char_dict and char2 in self.char_dict: + + shape_sim = self.shape_similarity(char1, char2, as_tree=as_tree) + sound_sim = self.pronunciation_similarity(char1, char2) + freq_sim = 1.0 - self.char_dict[char2] / len(self.char_dict) + + return shape_sim * shape_w + sound_sim * sound_w + freq_sim * freq_w + else: + return 0.0 + + def shape_similarity(self, char1, char2, safe=True, as_tree=False): + """ + >>> c = CharFuncs('data/char_meta.txt') + >>> c.shape_similarity('牛', '午') + 0.8571428571428572 + >>> c.shape_similarity('田', '由') + 0.8888888888888888 + """ + assert char1 in self.data + assert char2 in self.data + + def safe_encode(decomp): + tree = '' + for c in string_to_tree(decomp): + if c not in self.safe: + tree += c + else: + tree += self.safe[c] + return tree + + def safe_encode_string(decomp): + tree = '' + for c in decomp: + if c not in self.safe: + tree += c + else: + tree += self.safe[c] + return tree + + decomps_1 = self.data[char1]["decompositions"] + decomps_2 = self.data[char2]["decompositions"] + + similarity = 0.0 + if as_tree: + for decomp1 in decomps_1: + for decomp2 in decomps_2: + if not safe: + ted = tree_edit_distance(string_to_tree(decomp1), string_to_tree(decomp2)) + else: + ted = tree_edit_distance(safe_encode(decomp1), safe_encode(decomp2)) + normalized_ted = 2 * ted / (len(decomp1) + len(decomp2) + ted) + similarity = max(similarity, 1 - normalized_ted) + else: + for decomp1 in decomps_1: + for decomp2 in decomps_2: + if not safe: + ed = edit_distance(decomp1, decomp2) + else: + ed = edit_distance(safe_encode_string(decomp1), safe_encode_string(decomp2)) + normalized_ed = ed / max(len(decomp1), len(decomp2)) + similarity = max(similarity, 1 - normalized_ed) + + return similarity + + def pronunciation_similarity(self, char1, char2): + """ + >>> c = CharFuncs('data/char_meta.txt') + >>> c.pronunciation_similarity('牛', '午') + 0.27999999999999997 + >>> c.pronunciation_similarity('由', '田') + 0.09 + + """ + assert char1 in self.data + assert char2 in self.data + pronunciations1 = self.data[char1]["pronunciation"] + pronunciations2 = self.data[char2]["pronunciation"] + + if pronunciations1[0] == 'null' or pronunciations2 == 'null': + return 0.0 + else: + + pronunciations1 = pronunciations1.split(';') # separate by lan + pronunciations2 = 
pronunciations2.split(';') # separate by lan + + similarity = 0.0 + count = 0 + for pron_lan1, pron_lan2 in zip(pronunciations1, pronunciations2): + if (pron_lan1 == 'null') or (pron_lan2 == 'null'): + pass + else: + similarity_lan = 0.0 + for p1 in pron_lan1.split(','): + for p2 in pron_lan2.split(','): + tmp_sim = 1 - edit_distance(p1, p2) / max(len(p1), len(p2)) + similarity_lan = max(similarity_lan, tmp_sim) + similarity += similarity_lan + count += 1 + + return similarity / count if count else 0 + + +def string_to_tree(string): + """ + This function converts ids string to a string that can be used as a tree input to APTED. + Any Error raised by this function implies that the input string is invalid. + >>> string_to_tree('⿱⿱⿰丿㇏⿰丿㇏⿱⿰丿㇏⿰丿㇏') # 炎 + '{⿱{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}{⿱{⿰{丿}{㇏}}{⿰{丿}{㇏}}}}' + >>> string_to_tree('⿱⿰丿㇏⿱一⿱⿻一丨一') # 全 + '{⿱{⿰{丿}{㇏}}{⿱{一}{⿱{⿻{一}{丨}}{一}}}}' + >>> string_to_tree('⿱⿰丿㇏⿻⿱一⿱⿻一丨一丷') # 金 + '{⿱{⿰{丿}{㇏}}{⿻{⿱{一}{⿱{⿻{一}{丨}}{一}}}{丷}}}' + >>> string_to_tree('⿻⿻⿻一丨一⿴⿱⿰丨𠃌一一') # 車 + '{⿻{⿻{⿻{一}{丨}}{一}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' + >>> string_to_tree('⿻⿻⿻一丨⿰丿㇏⿴⿱⿰丨𠃌一一') # 東 + '{⿻{⿻{⿻{一}{丨}}{⿰{丿}{㇏}}}{⿴{⿱{⿰{丨}{𠃌}}{一}}{一}}}' + >>> string_to_tree('丿') # 丿 + '{丿}' + >>> string_to_tree('⿻') # ⿻ + '{⿻}' + """ + if string[0] in IDCS and len(string) != 1: + bracket_stack = [] + tree = [] + + def add_brackets(num): + if num == 2: + bracket_stack.extend(['}', '{', '}']) + else: + bracket_stack.extend(['}', '{', '}', '{', '}']) + tree.append('{') + + global_just_put = '{' + + for c in string: + tree.append(c) + if c in IDCS: + assert global_just_put != '}' + add_brackets(IDCS[c]) + global_just_put = '{' + else: + just_put = '' + while just_put != '{' and bracket_stack: + just_put = bracket_stack.pop(-1) + tree.append(just_put) + global_just_put = just_put + + res = ''.join(tree) + assert res[-1] == '}' + else: + assert len(string) == 1 or string == 'null' + res = string[0] + + return '{' + res + '}' + + +def pinyin_map(standard_pinyin): + """ + >>> pinyin_map('xuě') + 'xue3' + >>> pinyin_map('xue') + 'xue' + >>> pinyin_map('lǜ') + 'lü4' + >>> pinyin_map('fá') + 'fa2' + """ + tone = '' + pinyin = '' + + assert ' ' not in standard_pinyin + for c in standard_pinyin: + if c in PINYIN: + pinyin += PINYIN[c][0] + assert tone == '' + tone = str(PINYIN[c][1]) + else: + pinyin += c + pinyin += tone + return pinyin + + +def parse_args(): + usage = '\n1. You can compute character similarity by:\n' \ + 'python char_sim.py 午 牛 年 千\n' \ + '\n' \ + '2. 
You can use ted in computing character similarity by:\n' \ + 'python char_sim.py 午 牛 年 千 -t\n' \ + '\n' + parser = argparse.ArgumentParser( + description='A script to compute Chinese character (Kanji) similarity', usage=usage) + + parser.add_argument('multiargs', nargs='*', type=str, default=None, + help='Chinese characters in question') + parser.add_argument('--ted', '-t', action="store_true", default=False, + help='True=to use tree edit distence (TED)' + 'False=to use string edit distance') + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + c = CharFuncs('data/char_meta.txt') + if not args.ted: + for i, c1 in enumerate(args.multiargs): + for c2 in args.multiargs[i:]: + if c1 != c2: + print(f'For character pair ({c1}, {c2}):') + print(f' v-sim = {c.shape_similarity(c1, c2)}') + print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n') + else: + for i, c1 in enumerate(args.multiargs): + for c2 in args.multiargs[i:]: + if c1 != c2: + print(f'For character pair ({c1}, {c2}):') + print(f' v-sim = {c.shape_similarity(c1, c2, as_tree=True)}') print(f' p-sim = {c.pronunciation_similarity(c1, c2)}\n') \ No newline at end of file diff --git a/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py b/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py index 2e7567e8..41f6e818 100644 --- a/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py +++ b/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py @@ -1,433 +1,433 @@ -import argparse -from collections import Counter - -def main(): - # Parse command line args - args = parse_args() - # Open hypothesis and reference m2 files and split into chunks - hyp_m2 = open(args.hyp).read().strip().split("\n\n")[args.start:args.end] if args.start is not None and args.end is not None else open(args.hyp).read().strip().split("\n\n") - ref_m2 = open(args.ref).read().strip().split("\n\n")[args.start:args.end] if args.start is not None and args.end is not None else open(args.ref).read().strip().split("\n\n") - # Make sure they have the same number of sentences - assert len(hyp_m2) == len(ref_m2), print(len(hyp_m2), len(ref_m2)) - - # Store global corpus level best counts here - best_dict = Counter({"tp":0, "fp":0, "fn":0}) - best_cats = {} - # Process each sentence - sents = zip(hyp_m2, ref_m2) - for sent_id, sent in enumerate(sents): - # Simplify the edits into lists of lists - # if "A1" in sent[0] or "A1" in sent[1] or sent_id in sent_id_cons: - # sent_id_cons.append(sent_id) - src = sent[0].split("\n")[0] - hyp_edits = simplify_edits(sent[0], args.max_answer_num) - ref_edits = simplify_edits(sent[1], args.max_answer_num) - # Process the edits for detection/correction based on args - hyp_dict = process_edits(hyp_edits, args) - ref_dict = process_edits(ref_edits, args) - if args.reference_num is None or len(ref_dict.keys()) == args.reference_num: - # Evaluate edits and get best TP, FP, FN hyp+ref combo. 
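The corpus-level totals just below are accumulated with collections.Counter addition. The following self-contained snippet, using made-up per-sentence counts, shows exactly what that accumulation does, including the detail that Counter's in-place operators silently drop keys whose running total is not positive.

```python
from collections import Counter

corpus = Counter({"tp": 0, "fp": 0, "fn": 0})
per_sentence = [{"tp": 2, "fp": 1, "fn": 0}, {"tp": 1, "fp": 0, "fn": 2}]
for counts in per_sentence:
    corpus += Counter(counts)   # element-wise addition; zero-valued keys vanish until they turn positive
print(corpus)                   # Counter({'tp': 3, 'fn': 2, 'fp': 1})
```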
- count_dict, cat_dict = evaluate_edits(src, - hyp_dict, ref_dict, best_dict, sent_id, args) - # Merge these dicts with best_dict and best_cats - best_dict += Counter(count_dict) - best_cats = merge_dict(best_cats, cat_dict) - # Print results - print_results(best_dict, best_cats, args) - -# Parse command line args -def parse_args(): - parser = argparse.ArgumentParser( - description="Calculate F-scores for error detection and/or correction.\n" - "Flags let you evaluate at different levels of granularity.", - formatter_class=argparse.RawTextHelpFormatter, - usage="%(prog)s [options] -hyp HYP -ref REF") - parser.add_argument( - "-hyp", - help="A hypothesis M2 file.", - required=True) - parser.add_argument( - "-ref", - help="A reference M2 file.", - required=True) - parser.add_argument( - "--start", - type=int, - default=None - ) - parser.add_argument( - "--end", - type=int, - default=None - ) - parser.add_argument( - "--max_answer_num", - type=int, - default=None - ) - parser.add_argument( - "--reference_num", - type=int, - default=None - ) - parser.add_argument( - "-b", - "--beta", - help="Value of beta in F-score. (default: 0.5)", - default=0.5, - type=float) - parser.add_argument( - "-v", - "--verbose", - help="Print verbose output.", - action="store_true") - eval_type = parser.add_mutually_exclusive_group() - eval_type.add_argument( - "-dt", - help="Evaluate Detection in terms of Tokens.", - action="store_true") - eval_type.add_argument( - "-ds", - help="Evaluate Detection in terms of Spans.", - action="store_true") - eval_type.add_argument( - "-cs", - help="Evaluate Correction in terms of Spans. (default)", - action="store_true") - eval_type.add_argument( - "-cse", - help="Evaluate Correction in terms of Spans and Error types.", - action="store_true") - parser.add_argument( - "-single", - help="Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1", - action="store_true") - parser.add_argument( - "-multi", - help="Only evaluate multi token edits; i.e. 2+:n or n:2+", - action="store_true") - parser.add_argument( - "-multi_hyp_avg", - help="When get multiple hypotheses for a sentence, calculate their average F-scores for this sentence.", - action="store_true") # For IAA calculation - parser.add_argument( - "-multi_hyp_max", - help="When get multiple hypotheses for a sentence, calculate their F-scores and select the max one for this sentence.", - action="store_true") # For multiple hypotheses system evaluation - parser.add_argument( - "-filt", - help="Do not evaluate the specified error types.", - nargs="+", - default=[]) - parser.add_argument( - "-cat", - help="Show error category scores.\n" - "1: Only show operation tier scores; e.g. R.\n" - "2: Only show main tier scores; e.g. NOUN.\n" - "3: Show all category scores; e.g. R:NOUN.", - choices=[1, 2, 3], - type=int) - args = parser.parse_args() - return args - -# Input: An m2 format sentence with edits. -# Output: A list of lists. Each edit: [start, end, cat, cor, coder] -def simplify_edits(sent, max_answer_num): - out_edits = [] - # Get the edit lines from an m2 block. - edits = sent.split("\n") - # Loop through the edits - for edit in edits: - # Preprocessing - if edit.startswith("A "): - edit = edit[2:].split("|||") # Ignore "A " then split. 
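For reference, this is how one hypothetical M2 edit line decomposes under the slicing performed in simplify_edits; the field layout `A start end|||type|||correction|||...|||annotator` is the M2 convention this script consumes.

```python
edit_line = 'A 2 3|||S|||体 育|||REQUIRED|||-NONE-|||0'   # hypothetical edit covering tokens 2-3

fields = edit_line[2:].split("|||")     # strip the leading "A " and split on |||
span = fields[0].split()                # ['2', '3']
start, end = int(span[0]), int(span[1])
cat = fields[1]                         # 'S'
cor = fields[2].replace(" ", "")        # spaces inside the correction are removed -> '体育'
coder = int(fields[-1])                 # 0  (annotator id)
print([start, end, cat, cor, coder])    # [2, 3, 'S', '体育', 0]
```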
- span = edit[0].split() - start = int(span[0]) - end = int(span[1]) - cat = edit[1] - cor = edit[2].replace(" ", "") - coder = int(edit[-1]) - out_edit = [start, end, cat, cor, coder] - out_edits.append(out_edit) - # return [edit for edit in out_edits if edit[-1] in [0,1]] - if max_answer_num is None: - return out_edits - elif max_answer_num == 1: - return [edit for edit in out_edits if edit[-1] == 0] - elif max_answer_num == 2: - return [edit for edit in out_edits if edit[-1] in [0, 1]] - elif max_answer_num == 3: - return [edit for edit in out_edits if edit[-1] in [0, 1, 2]] - -# Input 1: A list of edits. Each edit: [start, end, cat, cor, coder] -# Input 2: Command line args -# Output: A dict; key is coder, value is edit dict. -def process_edits(edits, args): - coder_dict = {} - # Add an explicit noop edit if there are no edits. - if not edits: edits = [[-1, -1, "noop", "-NONE-", 0]] - # Loop through the edits - for edit in edits: - # Name the edit elements for clarity - start = edit[0] - end = edit[1] - cat = edit[2] - cor = edit[3] - coder = edit[4] - # Add the coder to the coder_dict if necessary - if coder not in coder_dict: coder_dict[coder] = {} - - # Optionally apply filters based on args - # 1. UNK type edits are only useful for detection, not correction. - if not args.dt and not args.ds and cat == "UNK": continue - # 2. Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1 - if args.single and (end-start >= 2 or len(cor.split()) >= 2): continue - # 3. Only evaluate multi token edits; i.e. 2+:n or n:2+ - if args.multi and end-start < 2 and len(cor.split()) < 2: continue - # 4. If there is a filter, ignore the specified error types - if args.filt and cat in args.filt: continue - - # Token Based Detection - if args.dt: - # Preserve noop edits. - if start == -1: - if (start, start) in coder_dict[coder].keys(): - coder_dict[coder][(start, start)].append(cat) - else: - coder_dict[coder][(start, start)] = [cat] - # Insertions defined as affecting the token on the right - elif start == end and start >= 0: - if (start, start+1) in coder_dict[coder].keys(): - coder_dict[coder][(start, start+1)].append(cat) - else: - coder_dict[coder][(start, start+1)] = [cat] - # Edit spans are split for each token in the range. - else: - for tok_id in range(start, end): - if (tok_id, tok_id+1) in coder_dict[coder].keys(): - coder_dict[coder][(tok_id, tok_id+1)].append(cat) - else: - coder_dict[coder][(tok_id, tok_id+1)] = [cat] - - # Span Based Detection - elif args.ds: - if (start, end) in coder_dict[coder].keys(): - coder_dict[coder][(start, end)].append(cat) - else: - coder_dict[coder][(start, end)] = [cat] - - # Span Based Correction - else: - # With error type classification - if args.cse: - if (start, end, cat, cor) in coder_dict[coder].keys(): - coder_dict[coder][(start, end, cat, cor)].append(cat) - else: - coder_dict[coder][(start, end, cat, cor)] = [cat] - # Without error type classification - else: - if (start, end, cor) in coder_dict[coder].keys(): - coder_dict[coder][(start, end, cor)].append(cat) - else: - coder_dict[coder][(start, end, cor)] = [cat] - return coder_dict - -# Input 1: A hyp dict; key is coder_id, value is dict of processed hyp edits. -# Input 2: A ref dict; key is coder_id, value is dict of processed ref edits. -# Input 3: A dictionary of the best corpus level TP, FP and FN counts so far. -# Input 4: Sentence ID (for verbose output only) -# Input 5: Command line args -# Output 1: A dict of the best corpus level TP, FP and FN for the input sentence. 
-# Output 2: The corresponding error type dict for the above dict. -def evaluate_edits(src, hyp_dict, ref_dict, best, sent_id, args): - # Store the best sentence level scores and hyp+ref combination IDs - # best_f is initialised as -1 cause 0 is a valid result. - best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = 0, 0, 0, -1, 0, 0 - best_cat = {} - # skip not annotatable sentence - if len(ref_dict.keys()) == 1: - ref_id = list(ref_dict.keys())[0] - if len(ref_dict[ref_id].keys()) == 1: - cat = list(ref_dict[ref_id].values())[0][0] - if cat == "NA": - best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} - return best_dict, best_cat - - # Compare each hyp and ref combination - for hyp_id in hyp_dict.keys(): - for ref_id in ref_dict.keys(): - # Get the local counts for the current combination. - tp, fp, fn, cat_dict = compareEdits(hyp_dict[hyp_id], ref_dict[ref_id]) - # Compute the local sentence scores (for verbose output only) - loc_p, loc_r, loc_f = computeFScore(tp, fp, fn, args.beta) - # Compute the global sentence scores - p, r, f = computeFScore( - tp+best["tp"], fp+best["fp"], fn+best["fn"], args.beta) - # Save the scores if they are better in terms of: - # 1. Higher F-score - # 2. Same F-score, higher TP - # 3. Same F-score and TP, lower FP - # 4. Same F-score, TP and FP, lower FN - if (f > best_f) or \ - (f == best_f and tp > best_tp) or \ - (f == best_f and tp == best_tp and fp < best_fp) or \ - (f == best_f and tp == best_tp and fp == best_fp and fn < best_fn): - best_tp, best_fp, best_fn = tp, fp, fn - best_f, best_hyp, best_ref = f, hyp_id, ref_id - best_cat = cat_dict - # Verbose output - if args.verbose: - # Prepare verbose output edits. - hyp_verb = list(sorted(hyp_dict[hyp_id].keys())) - ref_verb = list(sorted(ref_dict[ref_id].keys())) - # Ignore noop edits - if not hyp_verb or hyp_verb[0][0] == -1: hyp_verb = [] - if not ref_verb or ref_verb[0][0] == -1: ref_verb = [] - # Print verbose info - print('{:-^40}'.format("")) - print("SENTENCE "+str(sent_id)+src[1:]) - print('{:-^40}'.format("")) - print("SENTENCE "+str(sent_id)+" - HYP "+str(hyp_id)+" - REF "+str(ref_id)) - print("HYPOTHESIS EDITS :", hyp_verb) - print("REFERENCE EDITS :", ref_verb) - print("Local TP/FP/FN :", str(tp), str(fp), str(fn)) - print("Local P/R/F"+str(args.beta)+" :", str(loc_p), str(loc_r), str(loc_f)) - print("Global TP/FP/FN :", str(tp+best["tp"]), str(fp+best["fp"]), str(fn+best["fn"])) - print("Global P/R/F"+str(args.beta)+" :", str(p), str(r), str(f)) - # Verbose output: display the best hyp+ref combination - if args.verbose: - print('{:-^40}'.format("")) - print("^^ HYP "+str(best_hyp)+", REF "+str(best_ref)+" chosen for sentence "+str(sent_id)) - # Save the best TP, FP and FNs as a dict, and return this and the best_cat dict - best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} - return best_dict, best_cat - -# Input 1: A dictionary of hypothesis edits for a single system. -# Input 2: A dictionary of reference edits for a single annotator. -# Output 1-3: The TP, FP and FN for the hyp vs the given ref annotator. -# Output 4: A dictionary of the error type counts. -def compareEdits(hyp_edits, ref_edits): - tp = 0 # True Positives - fp = 0 # False Positives - fn = 0 # False Negatives - cat_dict = {} # {cat: [tp, fp, fn], ...} - - for h_edit, h_cats in hyp_edits.items(): - # noop hyp edits cannot be TP or FP - if h_cats[0] == "noop": continue - # TRUE POSITIVES - if h_edit in ref_edits.keys(): - # On occasion, multiple tokens at same span. 
- for h_cat in ref_edits[h_edit]: # Use ref dict for TP - tp += 1 - # Each dict value [TP, FP, FN] - if h_cat in cat_dict.keys(): - cat_dict[h_cat][0] += 1 - else: - cat_dict[h_cat] = [1, 0, 0] - # FALSE POSITIVES - else: - # On occasion, multiple tokens at same span. - for h_cat in h_cats: - fp += 1 - # Each dict value [TP, FP, FN] - if h_cat in cat_dict.keys(): - cat_dict[h_cat][1] += 1 - else: - cat_dict[h_cat] = [0, 1, 0] - for r_edit, r_cats in ref_edits.items(): - # noop ref edits cannot be FN - if r_cats[0] == "noop": continue - # FALSE NEGATIVES - if r_edit not in hyp_edits.keys(): - # On occasion, multiple tokens at same span. - for r_cat in r_cats: - fn += 1 - # Each dict value [TP, FP, FN] - if r_cat in cat_dict.keys(): - cat_dict[r_cat][2] += 1 - else: - cat_dict[r_cat] = [0, 0, 1] - return tp, fp, fn, cat_dict - -# Input 1-3: True positives, false positives, false negatives -# Input 4: Value of beta in F-score. -# Output 1-3: Precision, Recall and F-score rounded to 4dp. -def computeFScore(tp, fp, fn, beta): - p = float(tp)/(tp+fp) if fp else 1.0 - r = float(tp)/(tp+fn) if fn else 1.0 - f = float((1+(beta**2))*p*r)/(((beta**2)*p)+r) if p+r else 0.0 - return round(p, 4), round(r, 4), round(f, 4) - -# Input 1-2: Two error category dicts. Key is cat, value is list of TP, FP, FN. -# Output: The dictionaries combined with cumulative TP, FP, FN. -def merge_dict(dict1, dict2): - for cat, stats in dict2.items(): - if cat in dict1.keys(): - dict1[cat] = [x+y for x, y in zip(dict1[cat], stats)] - else: - dict1[cat] = stats - return dict1 - -# Input 1: A dict; key is error cat, value is counts for [tp, fp, fn] -# Input 2: Integer value denoting level of error category granularity. -# 1: Operation tier; e.g. M, R, U. 2: Main tier; e.g. NOUN, VERB 3: Everything. -# Output: A dictionary of category TP, FP and FN based on Input 2. -def processCategories(cat_dict, setting): - # Otherwise, do some processing. - proc_cat_dict = {} - for cat, cnt in cat_dict.items(): - if cat == "UNK": - proc_cat_dict[cat] = cnt - continue - # M, U, R or UNK combined only. - if setting == 1: - if cat[0] in proc_cat_dict.keys(): - proc_cat_dict[cat[0]] = [x+y for x, y in zip(proc_cat_dict[cat[0]], cnt)] - else: - proc_cat_dict[cat[0]] = cnt - # Everything without M, U or R. - elif setting == 2: - if cat[2:] in proc_cat_dict.keys(): - proc_cat_dict[cat[2:]] = [x+y for x, y in zip(proc_cat_dict[cat[2:]], cnt)] - else: - proc_cat_dict[cat[2:]] = cnt - # All error category combinations - else: - return cat_dict - return proc_cat_dict - -# Input 1: A dict of global best TP, FP and FNs -# Input 2: A dict of error types and counts for those TP, FP and FNs -# Input 3: Command line args -def print_results(best, best_cats, args): - # Prepare output title. - if args.dt: title = " Token-Based Detection " - elif args.ds: title = " Span-Based Detection " - elif args.cse: title = " Span-Based Correction + Classification " - else: title = " Span-Based Correction " - - # Category Scores - if args.cat: - best_cats = processCategories(best_cats, args.cat) - print("") - print('{:=^66}'.format(title)) - print("Category".ljust(14), "TP".ljust(8), "FP".ljust(8), "FN".ljust(8), - "P".ljust(8), "R".ljust(8), "F"+str(args.beta)) - for cat, cnts in sorted(best_cats.items()): - cat_p, cat_r, cat_f = computeFScore(cnts[0], cnts[1], cnts[2], args.beta) - print(cat.ljust(14), str(cnts[0]).ljust(8), str(cnts[1]).ljust(8), - str(cnts[2]).ljust(8), str(cat_p).ljust(8), str(cat_r).ljust(8), cat_f) - - # Print the overall results. 
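As a quick sanity check on the `computeFScore` helper above, here is a self-contained restatement of the F-beta formula with one worked call. It is an illustration only, not part of the patch; the counts are arbitrary.

```python
# Standalone restatement of the scorer's F-beta computation (beta defaults to 0.5),
# following its convention that precision/recall are 1.0 when there are no FPs/FNs.
def f_beta(tp, fp, fn, beta=0.5):
    p = tp / (tp + fp) if fp else 1.0        # precision
    r = tp / (tp + fn) if fn else 1.0        # recall
    if p + r == 0:
        return round(p, 4), round(r, 4), 0.0
    f = (1 + beta**2) * p * r / (beta**2 * p + r)
    return round(p, 4), round(r, 4), round(f, 4)

print(f_beta(2, 1, 3))   # -> (0.6667, 0.4, 0.5882)
```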
- print("") - print('{:=^46}'.format(title)) - print("\t".join(["TP", "FP", "FN", "Prec", "Rec", "F"+str(args.beta)])) - print("\t".join(map(str, [best["tp"], best["fp"], - best["fn"]]+list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta))))) - print('{:=^46}'.format("")) - print("") - -if __name__ == "__main__": - # Run the program - main() +import argparse +from collections import Counter + +def main(): + # Parse command line args + args = parse_args() + # Open hypothesis and reference m2 files and split into chunks + hyp_m2 = open(args.hyp).read().strip().split("\n\n")[args.start:args.end] if args.start is not None and args.end is not None else open(args.hyp).read().strip().split("\n\n") + ref_m2 = open(args.ref).read().strip().split("\n\n")[args.start:args.end] if args.start is not None and args.end is not None else open(args.ref).read().strip().split("\n\n") + # Make sure they have the same number of sentences + assert len(hyp_m2) == len(ref_m2), print(len(hyp_m2), len(ref_m2)) + + # Store global corpus level best counts here + best_dict = Counter({"tp":0, "fp":0, "fn":0}) + best_cats = {} + # Process each sentence + sents = zip(hyp_m2, ref_m2) + for sent_id, sent in enumerate(sents): + # Simplify the edits into lists of lists + # if "A1" in sent[0] or "A1" in sent[1] or sent_id in sent_id_cons: + # sent_id_cons.append(sent_id) + src = sent[0].split("\n")[0] + hyp_edits = simplify_edits(sent[0], args.max_answer_num) + ref_edits = simplify_edits(sent[1], args.max_answer_num) + # Process the edits for detection/correction based on args + hyp_dict = process_edits(hyp_edits, args) + ref_dict = process_edits(ref_edits, args) + if args.reference_num is None or len(ref_dict.keys()) == args.reference_num: + # Evaluate edits and get best TP, FP, FN hyp+ref combo. + count_dict, cat_dict = evaluate_edits(src, + hyp_dict, ref_dict, best_dict, sent_id, args) + # Merge these dicts with best_dict and best_cats + best_dict += Counter(count_dict) + best_cats = merge_dict(best_cats, cat_dict) + # Print results + print_results(best_dict, best_cats, args) + +# Parse command line args +def parse_args(): + parser = argparse.ArgumentParser( + description="Calculate F-scores for error detection and/or correction.\n" + "Flags let you evaluate at different levels of granularity.", + formatter_class=argparse.RawTextHelpFormatter, + usage="%(prog)s [options] -hyp HYP -ref REF") + parser.add_argument( + "-hyp", + help="A hypothesis M2 file.", + required=True) + parser.add_argument( + "-ref", + help="A reference M2 file.", + required=True) + parser.add_argument( + "--start", + type=int, + default=None + ) + parser.add_argument( + "--end", + type=int, + default=None + ) + parser.add_argument( + "--max_answer_num", + type=int, + default=None + ) + parser.add_argument( + "--reference_num", + type=int, + default=None + ) + parser.add_argument( + "-b", + "--beta", + help="Value of beta in F-score. (default: 0.5)", + default=0.5, + type=float) + parser.add_argument( + "-v", + "--verbose", + help="Print verbose output.", + action="store_true") + eval_type = parser.add_mutually_exclusive_group() + eval_type.add_argument( + "-dt", + help="Evaluate Detection in terms of Tokens.", + action="store_true") + eval_type.add_argument( + "-ds", + help="Evaluate Detection in terms of Spans.", + action="store_true") + eval_type.add_argument( + "-cs", + help="Evaluate Correction in terms of Spans. 
(default)", + action="store_true") + eval_type.add_argument( + "-cse", + help="Evaluate Correction in terms of Spans and Error types.", + action="store_true") + parser.add_argument( + "-single", + help="Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1", + action="store_true") + parser.add_argument( + "-multi", + help="Only evaluate multi token edits; i.e. 2+:n or n:2+", + action="store_true") + parser.add_argument( + "-multi_hyp_avg", + help="When get multiple hypotheses for a sentence, calculate their average F-scores for this sentence.", + action="store_true") # For IAA calculation + parser.add_argument( + "-multi_hyp_max", + help="When get multiple hypotheses for a sentence, calculate their F-scores and select the max one for this sentence.", + action="store_true") # For multiple hypotheses system evaluation + parser.add_argument( + "-filt", + help="Do not evaluate the specified error types.", + nargs="+", + default=[]) + parser.add_argument( + "-cat", + help="Show error category scores.\n" + "1: Only show operation tier scores; e.g. R.\n" + "2: Only show main tier scores; e.g. NOUN.\n" + "3: Show all category scores; e.g. R:NOUN.", + choices=[1, 2, 3], + type=int) + args = parser.parse_args() + return args + +# Input: An m2 format sentence with edits. +# Output: A list of lists. Each edit: [start, end, cat, cor, coder] +def simplify_edits(sent, max_answer_num): + out_edits = [] + # Get the edit lines from an m2 block. + edits = sent.split("\n") + # Loop through the edits + for edit in edits: + # Preprocessing + if edit.startswith("A "): + edit = edit[2:].split("|||") # Ignore "A " then split. + span = edit[0].split() + start = int(span[0]) + end = int(span[1]) + cat = edit[1] + cor = edit[2].replace(" ", "") + coder = int(edit[-1]) + out_edit = [start, end, cat, cor, coder] + out_edits.append(out_edit) + # return [edit for edit in out_edits if edit[-1] in [0,1]] + if max_answer_num is None: + return out_edits + elif max_answer_num == 1: + return [edit for edit in out_edits if edit[-1] == 0] + elif max_answer_num == 2: + return [edit for edit in out_edits if edit[-1] in [0, 1]] + elif max_answer_num == 3: + return [edit for edit in out_edits if edit[-1] in [0, 1, 2]] + +# Input 1: A list of edits. Each edit: [start, end, cat, cor, coder] +# Input 2: Command line args +# Output: A dict; key is coder, value is edit dict. +def process_edits(edits, args): + coder_dict = {} + # Add an explicit noop edit if there are no edits. + if not edits: edits = [[-1, -1, "noop", "-NONE-", 0]] + # Loop through the edits + for edit in edits: + # Name the edit elements for clarity + start = edit[0] + end = edit[1] + cat = edit[2] + cor = edit[3] + coder = edit[4] + # Add the coder to the coder_dict if necessary + if coder not in coder_dict: coder_dict[coder] = {} + + # Optionally apply filters based on args + # 1. UNK type edits are only useful for detection, not correction. + if not args.dt and not args.ds and cat == "UNK": continue + # 2. Only evaluate single token edits; i.e. 0:1, 1:0 or 1:1 + if args.single and (end-start >= 2 or len(cor.split()) >= 2): continue + # 3. Only evaluate multi token edits; i.e. 2+:n or n:2+ + if args.multi and end-start < 2 and len(cor.split()) < 2: continue + # 4. If there is a filter, ignore the specified error types + if args.filt and cat in args.filt: continue + + # Token Based Detection + if args.dt: + # Preserve noop edits. 
+ if start == -1: + if (start, start) in coder_dict[coder].keys(): + coder_dict[coder][(start, start)].append(cat) + else: + coder_dict[coder][(start, start)] = [cat] + # Insertions defined as affecting the token on the right + elif start == end and start >= 0: + if (start, start+1) in coder_dict[coder].keys(): + coder_dict[coder][(start, start+1)].append(cat) + else: + coder_dict[coder][(start, start+1)] = [cat] + # Edit spans are split for each token in the range. + else: + for tok_id in range(start, end): + if (tok_id, tok_id+1) in coder_dict[coder].keys(): + coder_dict[coder][(tok_id, tok_id+1)].append(cat) + else: + coder_dict[coder][(tok_id, tok_id+1)] = [cat] + + # Span Based Detection + elif args.ds: + if (start, end) in coder_dict[coder].keys(): + coder_dict[coder][(start, end)].append(cat) + else: + coder_dict[coder][(start, end)] = [cat] + + # Span Based Correction + else: + # With error type classification + if args.cse: + if (start, end, cat, cor) in coder_dict[coder].keys(): + coder_dict[coder][(start, end, cat, cor)].append(cat) + else: + coder_dict[coder][(start, end, cat, cor)] = [cat] + # Without error type classification + else: + if (start, end, cor) in coder_dict[coder].keys(): + coder_dict[coder][(start, end, cor)].append(cat) + else: + coder_dict[coder][(start, end, cor)] = [cat] + return coder_dict + +# Input 1: A hyp dict; key is coder_id, value is dict of processed hyp edits. +# Input 2: A ref dict; key is coder_id, value is dict of processed ref edits. +# Input 3: A dictionary of the best corpus level TP, FP and FN counts so far. +# Input 4: Sentence ID (for verbose output only) +# Input 5: Command line args +# Output 1: A dict of the best corpus level TP, FP and FN for the input sentence. +# Output 2: The corresponding error type dict for the above dict. +def evaluate_edits(src, hyp_dict, ref_dict, best, sent_id, args): + # Store the best sentence level scores and hyp+ref combination IDs + # best_f is initialised as -1 cause 0 is a valid result. + best_tp, best_fp, best_fn, best_f, best_hyp, best_ref = 0, 0, 0, -1, 0, 0 + best_cat = {} + # skip not annotatable sentence + if len(ref_dict.keys()) == 1: + ref_id = list(ref_dict.keys())[0] + if len(ref_dict[ref_id].keys()) == 1: + cat = list(ref_dict[ref_id].values())[0][0] + if cat == "NA": + best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} + return best_dict, best_cat + + # Compare each hyp and ref combination + for hyp_id in hyp_dict.keys(): + for ref_id in ref_dict.keys(): + # Get the local counts for the current combination. + tp, fp, fn, cat_dict = compareEdits(hyp_dict[hyp_id], ref_dict[ref_id]) + # Compute the local sentence scores (for verbose output only) + loc_p, loc_r, loc_f = computeFScore(tp, fp, fn, args.beta) + # Compute the global sentence scores + p, r, f = computeFScore( + tp+best["tp"], fp+best["fp"], fn+best["fn"], args.beta) + # Save the scores if they are better in terms of: + # 1. Higher F-score + # 2. Same F-score, higher TP + # 3. Same F-score and TP, lower FP + # 4. Same F-score, TP and FP, lower FN + if (f > best_f) or \ + (f == best_f and tp > best_tp) or \ + (f == best_f and tp == best_tp and fp < best_fp) or \ + (f == best_f and tp == best_tp and fp == best_fp and fn < best_fn): + best_tp, best_fp, best_fn = tp, fp, fn + best_f, best_hyp, best_ref = f, hyp_id, ref_id + best_cat = cat_dict + # Verbose output + if args.verbose: + # Prepare verbose output edits. 
+ hyp_verb = list(sorted(hyp_dict[hyp_id].keys())) + ref_verb = list(sorted(ref_dict[ref_id].keys())) + # Ignore noop edits + if not hyp_verb or hyp_verb[0][0] == -1: hyp_verb = [] + if not ref_verb or ref_verb[0][0] == -1: ref_verb = [] + # Print verbose info + print('{:-^40}'.format("")) + print("SENTENCE "+str(sent_id)+src[1:]) + print('{:-^40}'.format("")) + print("SENTENCE "+str(sent_id)+" - HYP "+str(hyp_id)+" - REF "+str(ref_id)) + print("HYPOTHESIS EDITS :", hyp_verb) + print("REFERENCE EDITS :", ref_verb) + print("Local TP/FP/FN :", str(tp), str(fp), str(fn)) + print("Local P/R/F"+str(args.beta)+" :", str(loc_p), str(loc_r), str(loc_f)) + print("Global TP/FP/FN :", str(tp+best["tp"]), str(fp+best["fp"]), str(fn+best["fn"])) + print("Global P/R/F"+str(args.beta)+" :", str(p), str(r), str(f)) + # Verbose output: display the best hyp+ref combination + if args.verbose: + print('{:-^40}'.format("")) + print("^^ HYP "+str(best_hyp)+", REF "+str(best_ref)+" chosen for sentence "+str(sent_id)) + # Save the best TP, FP and FNs as a dict, and return this and the best_cat dict + best_dict = {"tp":best_tp, "fp":best_fp, "fn":best_fn} + return best_dict, best_cat + +# Input 1: A dictionary of hypothesis edits for a single system. +# Input 2: A dictionary of reference edits for a single annotator. +# Output 1-3: The TP, FP and FN for the hyp vs the given ref annotator. +# Output 4: A dictionary of the error type counts. +def compareEdits(hyp_edits, ref_edits): + tp = 0 # True Positives + fp = 0 # False Positives + fn = 0 # False Negatives + cat_dict = {} # {cat: [tp, fp, fn], ...} + + for h_edit, h_cats in hyp_edits.items(): + # noop hyp edits cannot be TP or FP + if h_cats[0] == "noop": continue + # TRUE POSITIVES + if h_edit in ref_edits.keys(): + # On occasion, multiple tokens at same span. + for h_cat in ref_edits[h_edit]: # Use ref dict for TP + tp += 1 + # Each dict value [TP, FP, FN] + if h_cat in cat_dict.keys(): + cat_dict[h_cat][0] += 1 + else: + cat_dict[h_cat] = [1, 0, 0] + # FALSE POSITIVES + else: + # On occasion, multiple tokens at same span. + for h_cat in h_cats: + fp += 1 + # Each dict value [TP, FP, FN] + if h_cat in cat_dict.keys(): + cat_dict[h_cat][1] += 1 + else: + cat_dict[h_cat] = [0, 1, 0] + for r_edit, r_cats in ref_edits.items(): + # noop ref edits cannot be FN + if r_cats[0] == "noop": continue + # FALSE NEGATIVES + if r_edit not in hyp_edits.keys(): + # On occasion, multiple tokens at same span. + for r_cat in r_cats: + fn += 1 + # Each dict value [TP, FP, FN] + if r_cat in cat_dict.keys(): + cat_dict[r_cat][2] += 1 + else: + cat_dict[r_cat] = [0, 0, 1] + return tp, fp, fn, cat_dict + +# Input 1-3: True positives, false positives, false negatives +# Input 4: Value of beta in F-score. +# Output 1-3: Precision, Recall and F-score rounded to 4dp. +def computeFScore(tp, fp, fn, beta): + p = float(tp)/(tp+fp) if fp else 1.0 + r = float(tp)/(tp+fn) if fn else 1.0 + f = float((1+(beta**2))*p*r)/(((beta**2)*p)+r) if p+r else 0.0 + return round(p, 4), round(r, 4), round(f, 4) + +# Input 1-2: Two error category dicts. Key is cat, value is list of TP, FP, FN. +# Output: The dictionaries combined with cumulative TP, FP, FN. +def merge_dict(dict1, dict2): + for cat, stats in dict2.items(): + if cat in dict1.keys(): + dict1[cat] = [x+y for x, y in zip(dict1[cat], stats)] + else: + dict1[cat] = stats + return dict1 + +# Input 1: A dict; key is error cat, value is counts for [tp, fp, fn] +# Input 2: Integer value denoting level of error category granularity. 
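The selection rule in `evaluate_edits` above (prefer higher F-score, then higher TP, then lower FP, then lower FN) can be expressed as a single `max()` over a key tuple, and `main()` then folds the per-sentence winner into corpus totals with a `Counter`. The sketch below is an equivalent-in-spirit illustration with invented numbers, not the code path the script actually takes.

```python
# Illustrative only: tie-breaking between candidate hyp/ref combinations.
from collections import Counter

candidates = [
    {"tp": 2, "fp": 1, "fn": 2, "f": 0.625},
    {"tp": 1, "fp": 0, "fn": 3, "f": 0.625},   # same F0.5 (precision=1.0 when FP=0), lower TP -> loses
]
best = max(candidates, key=lambda c: (c["f"], c["tp"], -c["fp"], -c["fn"]))

# Corpus-level accumulation, as in main(): best_dict += Counter(count_dict)
corpus = Counter({"tp": 0, "fp": 0, "fn": 0})
corpus += Counter({k: best[k] for k in ("tp", "fp", "fn")})
print(best["tp"], dict(corpus))   # -> 2 {'tp': 2, 'fp': 1, 'fn': 2}
```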
+# 1: Operation tier; e.g. M, R, U. 2: Main tier; e.g. NOUN, VERB 3: Everything. +# Output: A dictionary of category TP, FP and FN based on Input 2. +def processCategories(cat_dict, setting): + # Otherwise, do some processing. + proc_cat_dict = {} + for cat, cnt in cat_dict.items(): + if cat == "UNK": + proc_cat_dict[cat] = cnt + continue + # M, U, R or UNK combined only. + if setting == 1: + if cat[0] in proc_cat_dict.keys(): + proc_cat_dict[cat[0]] = [x+y for x, y in zip(proc_cat_dict[cat[0]], cnt)] + else: + proc_cat_dict[cat[0]] = cnt + # Everything without M, U or R. + elif setting == 2: + if cat[2:] in proc_cat_dict.keys(): + proc_cat_dict[cat[2:]] = [x+y for x, y in zip(proc_cat_dict[cat[2:]], cnt)] + else: + proc_cat_dict[cat[2:]] = cnt + # All error category combinations + else: + return cat_dict + return proc_cat_dict + +# Input 1: A dict of global best TP, FP and FNs +# Input 2: A dict of error types and counts for those TP, FP and FNs +# Input 3: Command line args +def print_results(best, best_cats, args): + # Prepare output title. + if args.dt: title = " Token-Based Detection " + elif args.ds: title = " Span-Based Detection " + elif args.cse: title = " Span-Based Correction + Classification " + else: title = " Span-Based Correction " + + # Category Scores + if args.cat: + best_cats = processCategories(best_cats, args.cat) + print("") + print('{:=^66}'.format(title)) + print("Category".ljust(14), "TP".ljust(8), "FP".ljust(8), "FN".ljust(8), + "P".ljust(8), "R".ljust(8), "F"+str(args.beta)) + for cat, cnts in sorted(best_cats.items()): + cat_p, cat_r, cat_f = computeFScore(cnts[0], cnts[1], cnts[2], args.beta) + print(cat.ljust(14), str(cnts[0]).ljust(8), str(cnts[1]).ljust(8), + str(cnts[2]).ljust(8), str(cat_p).ljust(8), str(cat_r).ljust(8), cat_f) + + # Print the overall results. + print("") + print('{:=^46}'.format(title)) + print("\t".join(["TP", "FP", "FN", "Prec", "Rec", "F"+str(args.beta)])) + print("\t".join(map(str, [best["tp"], best["fp"], + best["fn"]]+list(computeFScore(best["tp"], best["fp"], best["fn"], args.beta))))) + print('{:=^46}'.format("")) + print("") + +if __name__ == "__main__": + # Run the program + main() diff --git a/opencompass/datasets/lawbench/utils/comprehension_scores.py b/opencompass/datasets/lawbench/utils/comprehension_scores.py index 632c5408..d18276d3 100644 --- a/opencompass/datasets/lawbench/utils/comprehension_scores.py +++ b/opencompass/datasets/lawbench/utils/comprehension_scores.py @@ -1,82 +1,82 @@ -import re -from ..utils.rc_f1 import CJRCEvaluator - - -""" -given a target substring. 
find its all occurances in the string s -return the starting and ending index of every occurance -""" - - -def __find_substring_starts(s, target): - return [(m.start(), m.end()) for m in re.finditer(target, s)] - - -""" -compute the reading comprehension F1 scores -hyps and refs are lists of hyposisis and reference strings -""" - - -def compute_rc_f1(hyps, refs): - scores = 0 - for h, r in zip(hyps, refs): - scores += CJRCEvaluator.compute_f1(r, h) - return {'score': scores / len(hyps)} - - -""" -compute the information extraction F1 scores -hyps and refs are lists of hyposisis and reference strings -entity_types: a set of all possible entity types -""" - - -def compute_ie_f1(hyps, refs, entity_types): - assert (len(hyps) == len(refs)) - scores, abstentions = 0, 0 - for h, r in zip(hyps, refs): - h = __extract_entities_pred(h, entity_types) - r = __extract_entities_ref(r) - if r == {}: - scores += 1 if h == {} else 0 - continue - if h == {}: - abstentions += 1 - intersected = [CJRCEvaluator.compute_f1(r[etype], einstance) for etype, einstance in h.items() if etype in r] - prec = sum(intersected) / len(h) if len(h) > 0 else 0 - rec = sum(intersected) / len(r) if len(r) > 0 else 0 - # print(prec, rec, intersected) - scores += 2 * prec * rec / (prec + rec + 1e-10) - return {'score': scores / len(hyps), "anstention_rate": abstentions / len(hyps)} - - -def __extract_entities_ref(ref): - outputs = {} - if ref.strip() == '': - return outputs - for seg in ref.split(';'): - seg = seg.split(':') - outputs[seg[0]] = seg[1] - return outputs - - -""" -extract entity type and instances from the model prediction -pred: string of model prediction -entity_types: a set of all possible entity types -""" - - -def __extract_entities_pred(pred, entity_types): - outputs = {} - for etype in entity_types: - occurances = __find_substring_starts(pred, etype) - for start, end in occurances: - if end >= (len(pred) - 2): - continue - if pred[end] == ":" or pred[end] == ":": - einstance = re.split("\n| ", pred[end + 1:].strip())[0].strip() - if einstance != '无' and einstance != '未提及': - outputs[etype] = einstance - return outputs +import re +from ..utils.rc_f1 import CJRCEvaluator + + +""" +given a target substring. 
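For context on `compute_ie_f1` above: the reference string lists entities as `type:instance` pairs separated by `;`; each predicted instance is scored against the reference instance of the same type with a token-level F1 (via `CJRCEvaluator`), and precision/recall are averaged over predicted/reference types respectively. The sketch below only shows the reference parsing step, with invented entity types and values.

```python
# Illustrative only: parse a "type:instance;type:instance" reference string,
# mirroring __extract_entities_ref in comprehension_scores.py.
def parse_entities(ref: str) -> dict:
    outputs = {}
    if not ref.strip():
        return outputs
    for seg in ref.split(";"):
        etype, einstance = seg.split(":", 1)
        outputs[etype] = einstance
    return outputs

print(parse_entities("犯罪嫌疑人:张三;被害人:李四"))
# -> {'犯罪嫌疑人': '张三', '被害人': '李四'}
```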
find its all occurances in the string s +return the starting and ending index of every occurance +""" + + +def __find_substring_starts(s, target): + return [(m.start(), m.end()) for m in re.finditer(target, s)] + + +""" +compute the reading comprehension F1 scores +hyps and refs are lists of hyposisis and reference strings +""" + + +def compute_rc_f1(hyps, refs): + scores = 0 + for h, r in zip(hyps, refs): + scores += CJRCEvaluator.compute_f1(r, h) + return {'score': scores / len(hyps)} + + +""" +compute the information extraction F1 scores +hyps and refs are lists of hyposisis and reference strings +entity_types: a set of all possible entity types +""" + + +def compute_ie_f1(hyps, refs, entity_types): + assert (len(hyps) == len(refs)) + scores, abstentions = 0, 0 + for h, r in zip(hyps, refs): + h = __extract_entities_pred(h, entity_types) + r = __extract_entities_ref(r) + if r == {}: + scores += 1 if h == {} else 0 + continue + if h == {}: + abstentions += 1 + intersected = [CJRCEvaluator.compute_f1(r[etype], einstance) for etype, einstance in h.items() if etype in r] + prec = sum(intersected) / len(h) if len(h) > 0 else 0 + rec = sum(intersected) / len(r) if len(r) > 0 else 0 + # print(prec, rec, intersected) + scores += 2 * prec * rec / (prec + rec + 1e-10) + return {'score': scores / len(hyps), "anstention_rate": abstentions / len(hyps)} + + +def __extract_entities_ref(ref): + outputs = {} + if ref.strip() == '': + return outputs + for seg in ref.split(';'): + seg = seg.split(':') + outputs[seg[0]] = seg[1] + return outputs + + +""" +extract entity type and instances from the model prediction +pred: string of model prediction +entity_types: a set of all possible entity types +""" + + +def __extract_entities_pred(pred, entity_types): + outputs = {} + for etype in entity_types: + occurances = __find_substring_starts(pred, etype) + for start, end in occurances: + if end >= (len(pred) - 2): + continue + if pred[end] == ":" or pred[end] == ":": + einstance = re.split("\n| ", pred[end + 1:].strip())[0].strip() + if einstance != '无' and einstance != '未提及': + outputs[etype] = einstance + return outputs diff --git a/opencompass/datasets/lawbench/utils/function_utils.py b/opencompass/datasets/lawbench/utils/function_utils.py index e4c6659d..a5c469a2 100644 --- a/opencompass/datasets/lawbench/utils/function_utils.py +++ b/opencompass/datasets/lawbench/utils/function_utils.py @@ -1,49 +1,49 @@ -from rouge_chinese import Rouge -import jieba -from nltk.translate.gleu_score import corpus_gleu - -def compute_f1_two_sets(pred_set, gt_set): - precision = len(pred_set.intersection(gt_set)) / len(pred_set) if len(pred_set) > 0 else 0 - recall = len(pred_set.intersection(gt_set)) / len(gt_set) if len(gt_set) > 0 else 0 - f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0 - return f1 - -def multi_choice_judge(prediction, option_list, answer_token): - # a dict, key: letters in the option list, value: count of the letter in the prediction - count_dict, abstention, accuracy = {}, 0, 0 - for option in option_list: - option_count = prediction.count(option) - count_dict[option] = 1 if option_count > 0 else 0 # multiple occurrence of the same letter is counted as 1 - - if sum(count_dict.values()) == 0: - abstention = 1 - # if the answer token is the only predicted token, the prediction is correct - elif count_dict[answer_token] == 1 and sum(count_dict.values()) == 1: - accuracy = 1 - return {"score": accuracy, "abstention": abstention} - -""" -compute the rouge score. 
-hyps and refs are lists of hyposisis and reference strings -empty predictions are replaces with 无内容 -""" - - -def compute_rouge(hyps, refs): - assert(len(hyps) == len(refs)) - hyps = [' '.join(jieba.cut(h)) for h in hyps] - hyps = [h if h.strip() != "" else "无内容" for h in hyps] - refs = [' '.join(jieba.cut(r)) for r in refs] - return Rouge().get_scores(hyps, refs) - -""" -compute the gleu score. -hyps and refs are lists of hyposisis and reference strings -empty predictions are replaces with 无内容 -""" -def compute_gleu(hyps, refs): - assert(len(hyps) == len(refs)) - hyps = [' '.join(jieba.cut(h)) for h in hyps] - hyps = [h if h.strip() != "" else "无内容" for h in hyps] - refs = [[' '.join(jieba.cut(r))] for r in refs] - return corpus_gleu(refs, hyps) +from rouge_chinese import Rouge +import jieba +from nltk.translate.gleu_score import corpus_gleu + +def compute_f1_two_sets(pred_set, gt_set): + precision = len(pred_set.intersection(gt_set)) / len(pred_set) if len(pred_set) > 0 else 0 + recall = len(pred_set.intersection(gt_set)) / len(gt_set) if len(gt_set) > 0 else 0 + f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0 + return f1 + +def multi_choice_judge(prediction, option_list, answer_token): + # a dict, key: letters in the option list, value: count of the letter in the prediction + count_dict, abstention, accuracy = {}, 0, 0 + for option in option_list: + option_count = prediction.count(option) + count_dict[option] = 1 if option_count > 0 else 0 # multiple occurrence of the same letter is counted as 1 + + if sum(count_dict.values()) == 0: + abstention = 1 + # if the answer token is the only predicted token, the prediction is correct + elif count_dict[answer_token] == 1 and sum(count_dict.values()) == 1: + accuracy = 1 + return {"score": accuracy, "abstention": abstention} + +""" +compute the rouge score. +hyps and refs are lists of hyposisis and reference strings +empty predictions are replaces with 无内容 +""" + + +def compute_rouge(hyps, refs): + assert(len(hyps) == len(refs)) + hyps = [' '.join(jieba.cut(h)) for h in hyps] + hyps = [h if h.strip() != "" else "无内容" for h in hyps] + refs = [' '.join(jieba.cut(r)) for r in refs] + return Rouge().get_scores(hyps, refs) + +""" +compute the gleu score. 
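A usage sketch for the ROUGE helper above: predictions and references are segmented with `jieba`, joined with spaces, and empty predictions are replaced by the placeholder `无内容` before scoring (GLEU follows the same segmentation). This is an illustration only, assuming the third-party packages the module already imports (`jieba`, `rouge_chinese`) are installed; the sentences are invented.

```python
# Illustrative only: score one prediction/reference pair the way compute_rouge does.
import jieba
from rouge_chinese import Rouge

hyp = "被告人被判处有期徒刑三年"        # invented prediction
ref = "被告人依法被判处有期徒刑三年"    # invented reference

hyp_seg = " ".join(jieba.cut(hyp)) or "无内容"   # fall back to the placeholder if empty
ref_seg = " ".join(jieba.cut(ref))
scores = Rouge().get_scores([hyp_seg], [ref_seg])
print(scores[0]["rouge-l"]["f"])                 # ROUGE-L F1 for this single pair
```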
+hyps and refs are lists of hyposisis and reference strings +empty predictions are replaces with 无内容 +""" +def compute_gleu(hyps, refs): + assert(len(hyps) == len(refs)) + hyps = [' '.join(jieba.cut(h)) for h in hyps] + hyps = [h if h.strip() != "" else "无内容" for h in hyps] + refs = [[' '.join(jieba.cut(r))] for r in refs] + return corpus_gleu(refs, hyps) diff --git a/opencompass/datasets/lawbench/utils/modules/__init__.py b/opencompass/datasets/lawbench/utils/modules/__init__.py index 8b137891..d3f5a12f 100644 --- a/opencompass/datasets/lawbench/utils/modules/__init__.py +++ b/opencompass/datasets/lawbench/utils/modules/__init__.py @@ -1 +1 @@ - + diff --git a/opencompass/datasets/lawbench/utils/modules/alignment.py b/opencompass/datasets/lawbench/utils/modules/alignment.py index d11feb74..a0d12cfe 100644 --- a/opencompass/datasets/lawbench/utils/modules/alignment.py +++ b/opencompass/datasets/lawbench/utils/modules/alignment.py @@ -1,332 +1,332 @@ -import numpy as np -from typing import List, Tuple, Dict -from modules.tokenizer import Tokenizer -import os -from string import punctuation - -REAL_PATH = os.path.split(os.path.realpath(__file__))[0] -chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏" -english_punct = punctuation -punct = chinese_punct + english_punct - -def check_all_chinese(word): - """ - 判断一个单词是否全部由中文组成 - :param word: - :return: - """ - return all(['\u4e00' <= ch <= '\u9fff' for ch in word]) - -def read_cilin(): - """ - Cilin 詞林 is a thesaurus with semantic information - """ - # TODO -- fix this path - lines = open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n") - semantic_dict = {} - semantic_classes = {} - for line in lines: - code, *words = line.split(" ") - for word in words: - semantic_dict[word] = code - # make reverse dict - if code in semantic_classes: - semantic_classes[code] += words - else: - semantic_classes[code] = words - return semantic_dict, semantic_classes - - -def read_confusion(): - confusion_dict = {} - with open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f: - for line in f: - li = line.rstrip('\n').split(" ") - confusion_dict[li[0]] = li[1:] - return confusion_dict - -class Alignment: - """ - 对齐错误句子和正确句子, - 使用编辑距离算法抽取编辑操作 - """ - - def __init__( - self, - semantic_dict: Dict, - confusion_dict: Dict, - granularity: str = "word", - ) -> None: - """ - 构造函数 - :param semantic_dict: 语义词典(大词林) - :param confusion_dict: 字符混淆集 - """ - self.insertion_cost = 1 - self.deletion_cost = 1 - self.semantic_dict = semantic_dict - self.confusion_dict = confusion_dict - # Because we use character level tokenization, this doesn't currently use POS - self._open_pos = {} # 如果是词级别,还可以利用词性是否相同来计算cost - self.granularity = granularity # word-level or character-level - self.align_seqs = [] - - def __call__(self, - src: List[Tuple], - tgt: List[Tuple], - verbose: bool = False): - cost_matrix, oper_matrix = self.align(src, tgt) - align_seq = self.get_cheapest_align_seq(oper_matrix) - - if verbose: - print("========== Seg. 
and POS: ==========") - print(src) - print(tgt) - print("========== Cost Matrix ==========") - print(cost_matrix) - print("========== Oper Matrix ==========") - print(oper_matrix) - print("========== Alignment ==========") - print(align_seq) - print("========== Results ==========") - for a in align_seq: - print(a[0], src[a[1]: a[2]], tgt[a[3]: a[4]]) - return align_seq - - def _get_semantic_class(self, word): - """ - NOTE: Based on the paper: - Improved-Edit-Distance Kernel for Chinese Relation Extraction - 获取每个词语的语义类别(基于大词林,有三个级别) - """ - if word in self.semantic_dict: - code = self.semantic_dict[word] - high, mid, low = code[0], code[1], code[2:4] - return high, mid, low - else: # unknown - return None - - @staticmethod - def _get_class_diff(a_class, b_class): - """ - d == 3 for equivalent semantics - d == 0 for completely different semantics - 根据大词林的信息,计算两个词的语义类别的差距 - """ - d = sum([a == b for a, b in zip(a_class, b_class)]) - return d - - def _get_semantic_cost(self, a, b): - """ - 计算基于语义信息的替换操作cost - :param a: 单词a的语义类别 - :param b: 单词b的语义类别 - :return: 替换编辑代价 - """ - a_class = self._get_semantic_class(a) - b_class = self._get_semantic_class(b) - # unknown class, default to 1 - if a_class is None or b_class is None: - return 4 - elif a_class == b_class: - return 0 - else: - return 2 * (3 - self._get_class_diff(a_class, b_class)) - - def _get_pos_cost(self, a_pos, b_pos): - """ - 计算基于词性信息的编辑距离cost - :param a_pos: 单词a的词性 - :param b_pos: 单词b的词性 - :return: 替换编辑代价 - """ - if a_pos == b_pos: - return 0 - elif a_pos in self._open_pos and b_pos in self._open_pos: - return 0.25 - else: - return 0.499 - - def _get_char_cost(self, a, b, pinyin_a, pinyin_b): - """ - NOTE: This is a replacement of ERRANTS lemma cost for Chinese - 计算基于字符相似度的编辑距离cost - """ - if not (check_all_chinese(a) and check_all_chinese(b)): - return 0.5 - if len(a) > len(b): - a, b = b, a - pinyin_a, pinyin_b = pinyin_b, pinyin_a - if a == b: - return 0 - else: - return self._get_spell_cost(a, b, pinyin_a, pinyin_b) - - def _get_spell_cost(self, a, b, pinyin_a, pinyin_b): - """ - 计算两个单词拼写相似度,分别由字形相似度和字音相似度组成 - :param a: 单词a - :param b: 单词b,且单词a的长度小于等于b - :param pinyin_a: 单词a的拼音 - :param pinyin_b: 单词b的拼音 - :return: 替换操作cost - """ - count = 0 - for i in range(len(a)): - for j in range(len(b)): - if a[i] == b[j] or (set(pinyin_a) & set(pinyin_b)) or (b[j] in self.confusion_dict.keys() and a[i] in self.confusion_dict[b[j]]) or (a[i] in self.confusion_dict.keys() and b[j] in self.confusion_dict[a[i]]): - count += 1 - break - return (len(a) - count) / (len(a) * 2) - - def get_sub_cost(self, a_seg, b_seg): - """ - Calculate the substitution cost between words a and b - 计算两个单词替换操作的编辑cost,最大为2,等于一次删除和一次添加 - """ - if a_seg[0] == b_seg[0]: - return 0 - - if self.granularity == "word": # 词级别可以额外利用词性信息 - semantic_cost = self._get_semantic_cost(a_seg[0], b_seg[0]) / 6.0 - pos_cost = self._get_pos_cost(a_seg[1], b_seg[1]) - char_cost = self._get_char_cost(a_seg[0], b_seg[0], a_seg[2], b_seg[2]) - return semantic_cost + pos_cost + char_cost - else: # 字级别只能利用字义信息(从大词林中获取)和字面相似度信息 - semantic_cost = self._get_semantic_cost(a_seg[0], b_seg[0]) / 6.0 - if a_seg[0] in punct and b_seg[0] in punct: - pos_cost = 0.0 - elif a_seg[0] not in punct and b_seg[0] not in punct: - pos_cost = 0.25 - else: - pos_cost = 0.499 - # pos_cost = 0.0 if (a_seg[0] in punct and b_seg[0] in punct) or (a_seg[0] not in punct and b_seg[0] not in punct) else 0.5 - char_cost = self._get_char_cost(a_seg[0], b_seg[0], a_seg[2], b_seg[2]) - return semantic_cost + char_cost + 
pos_cost - - def align(self, - src: List[Tuple], - tgt: List[Tuple]): - """ - Based on ERRANT's alignment - 基于改进的动态规划算法,为原句子的每个字打上编辑标签,以便使它能够成功转换为目标句子。 - 编辑操作类别: - 1) M:Match,即KEEP,即当前字保持不变 - 2) D:Delete,删除,即当前字需要被删除 - 3) I:Insert,插入,即当前字需要被插入 - 4) T:Transposition,移位操作,即涉及到词序问题 - """ - cost_matrix = np.zeros((len(src) + 1, len(tgt) + 1)) # 编辑cost矩阵 - oper_matrix = np.full( - (len(src) + 1, len(tgt) + 1), "O", dtype=object - ) # 操作矩阵 - # Fill in the edges - for i in range(1, len(src) + 1): - cost_matrix[i][0] = cost_matrix[i - 1][0] + 1 - oper_matrix[i][0] = ["D"] - for j in range(1, len(tgt) + 1): - cost_matrix[0][j] = cost_matrix[0][j - 1] + 1 - oper_matrix[0][j] = ["I"] - - # Loop through the cost matrix - for i in range(len(src)): - for j in range(len(tgt)): - # Matches - if src[i][0] == tgt[j][0]: # 如果两个字相等,则匹配成功(Match),编辑距离为0 - cost_matrix[i + 1][j + 1] = cost_matrix[i][j] - oper_matrix[i + 1][j + 1] = ["M"] - # Non-matches - else: - del_cost = cost_matrix[i][j + 1] + self.deletion_cost # 由删除动作得到的总cost - ins_cost = cost_matrix[i + 1][j] + self.insertion_cost # 由插入动作得到的总cost - sub_cost = cost_matrix[i][j] + self.get_sub_cost( - src[i], tgt[j] - ) # 由替换动作得到的总cost - # Calculate transposition cost - # 计算移位操作的总cost - trans_cost = float("inf") - k = 1 - while ( - i - k >= 0 - and j - k >= 0 - and cost_matrix[i - k + 1][j - k + 1] - != cost_matrix[i - k][j - k] - ): - p1 = sorted([a[0] for a in src][i - k: i + 1]) - p2 = sorted([b[0] for b in tgt][j - k: j + 1]) - if p1 == p2: - trans_cost = cost_matrix[i - k][j - k] + k - break - k += 1 - - costs = [trans_cost, sub_cost, ins_cost, del_cost] - ind = costs.index(min(costs)) - cost_matrix[i + 1][j + 1] = costs[ind] - # ind = costs.index(costs[ind], ind+1) - for idx, cost in enumerate(costs): - if cost == costs[ind]: - if idx == 0: - if oper_matrix[i + 1][j + 1] == "O": - oper_matrix[i + 1][j + 1] = ["T" + str(k + 1)] - else: - oper_matrix[i + 1][j + 1].append("T" + str(k + 1)) - elif idx == 1: - if oper_matrix[i + 1][j + 1] == "O": - oper_matrix[i + 1][j + 1] = ["S"] - else: - oper_matrix[i + 1][j + 1].append("S") - elif idx == 2: - if oper_matrix[i + 1][j + 1] == "O": - oper_matrix[i + 1][j + 1] = ["I"] - else: - oper_matrix[i + 1][j + 1].append("I") - else: - if oper_matrix[i + 1][j + 1] == "O": - oper_matrix[i + 1][j + 1] = ["D"] - else: - oper_matrix[i + 1][j + 1].append("D") - return cost_matrix, oper_matrix - - def _dfs(self, i, j, align_seq_now, oper_matrix, strategy="all"): - """ - 深度优先遍历,获取最小编辑距离相同的所有序列 - """ - if i + j == 0: - self.align_seqs.append(align_seq_now) - else: - ops = oper_matrix[i][j] # 可以类比成搜索一棵树从根结点到叶子结点的所有路径 - if strategy != "all": ops = ops[:1] - for op in ops: - if op in {"M", "S"}: - self._dfs(i - 1, j - 1, align_seq_now + [(op, i - 1, i, j - 1, j)], oper_matrix, strategy) - elif op == "D": - self._dfs(i - 1, j, align_seq_now + [(op, i - 1, i, j, j)], oper_matrix, strategy) - elif op == "I": - self._dfs(i, j - 1, align_seq_now + [(op, i, i, j - 1, j)], oper_matrix, strategy) - else: - k = int(op[1:]) - self._dfs(i - k, j - k, align_seq_now + [(op, i - k, i, j - k, j)], oper_matrix, strategy) - - def get_cheapest_align_seq(self, oper_matrix): - """ - 回溯获得编辑距离最小的编辑序列 - """ - self.align_seqs = [] - i = oper_matrix.shape[0] - 1 - j = oper_matrix.shape[1] - 1 - if abs(i - j) > 10: - self._dfs(i, j , [], oper_matrix, "first") - else: - self._dfs(i, j , [], oper_matrix, "all") - final_align_seqs = [seq[::-1] for seq in self.align_seqs] - return final_align_seqs - - -if __name__ == "__main__": - tokenizer = 
Tokenizer("word") - semantic_dict, semantic_class = read_cilin() - confusion_dict = read_confusion() - alignment = Alignment(semantic_dict, confusion_dict) - sents = ["首先 , 我们 得 准备 : 大 虾六 到 九 只 、 盐 一 茶匙 、 已 搾 好 的 柠檬汁 三 汤匙 、 泰国 柠檬 叶三叶 、 柠檬 香草 一 根 、 鱼酱 两 汤匙 、 辣椒 6 粒 , 纯净 水 4量杯 、 香菜 半量杯 和 草菇 10 个 。".replace(" ", ""), "首先 , 我们 得 准备 : 大 虾六 到 九 只 、 盐 一 茶匙 、 已 榨 好 的 柠檬汁 三 汤匙 、 泰国 柠檬 叶三叶 、 柠檬 香草 一 根 、 鱼酱 两 汤匙 、 辣椒 六 粒 , 纯净 水 四 量杯 、 香菜 半量杯 和 草菇 十 个 。".replace(" ", "")] - src, tgt = tokenizer(sents) +import numpy as np +from typing import List, Tuple, Dict +from modules.tokenizer import Tokenizer +import os +from string import punctuation + +REAL_PATH = os.path.split(os.path.realpath(__file__))[0] +chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏" +english_punct = punctuation +punct = chinese_punct + english_punct + +def check_all_chinese(word): + """ + 判断一个单词是否全部由中文组成 + :param word: + :return: + """ + return all(['\u4e00' <= ch <= '\u9fff' for ch in word]) + +def read_cilin(): + """ + Cilin 詞林 is a thesaurus with semantic information + """ + # TODO -- fix this path + lines = open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n") + semantic_dict = {} + semantic_classes = {} + for line in lines: + code, *words = line.split(" ") + for word in words: + semantic_dict[word] = code + # make reverse dict + if code in semantic_classes: + semantic_classes[code] += words + else: + semantic_classes[code] = words + return semantic_dict, semantic_classes + + +def read_confusion(): + confusion_dict = {} + with open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f: + for line in f: + li = line.rstrip('\n').split(" ") + confusion_dict[li[0]] = li[1:] + return confusion_dict + +class Alignment: + """ + 对齐错误句子和正确句子, + 使用编辑距离算法抽取编辑操作 + """ + + def __init__( + self, + semantic_dict: Dict, + confusion_dict: Dict, + granularity: str = "word", + ) -> None: + """ + 构造函数 + :param semantic_dict: 语义词典(大词林) + :param confusion_dict: 字符混淆集 + """ + self.insertion_cost = 1 + self.deletion_cost = 1 + self.semantic_dict = semantic_dict + self.confusion_dict = confusion_dict + # Because we use character level tokenization, this doesn't currently use POS + self._open_pos = {} # 如果是词级别,还可以利用词性是否相同来计算cost + self.granularity = granularity # word-level or character-level + self.align_seqs = [] + + def __call__(self, + src: List[Tuple], + tgt: List[Tuple], + verbose: bool = False): + cost_matrix, oper_matrix = self.align(src, tgt) + align_seq = self.get_cheapest_align_seq(oper_matrix) + + if verbose: + print("========== Seg. 
and POS: ==========") + print(src) + print(tgt) + print("========== Cost Matrix ==========") + print(cost_matrix) + print("========== Oper Matrix ==========") + print(oper_matrix) + print("========== Alignment ==========") + print(align_seq) + print("========== Results ==========") + for a in align_seq: + print(a[0], src[a[1]: a[2]], tgt[a[3]: a[4]]) + return align_seq + + def _get_semantic_class(self, word): + """ + NOTE: Based on the paper: + Improved-Edit-Distance Kernel for Chinese Relation Extraction + 获取每个词语的语义类别(基于大词林,有三个级别) + """ + if word in self.semantic_dict: + code = self.semantic_dict[word] + high, mid, low = code[0], code[1], code[2:4] + return high, mid, low + else: # unknown + return None + + @staticmethod + def _get_class_diff(a_class, b_class): + """ + d == 3 for equivalent semantics + d == 0 for completely different semantics + 根据大词林的信息,计算两个词的语义类别的差距 + """ + d = sum([a == b for a, b in zip(a_class, b_class)]) + return d + + def _get_semantic_cost(self, a, b): + """ + 计算基于语义信息的替换操作cost + :param a: 单词a的语义类别 + :param b: 单词b的语义类别 + :return: 替换编辑代价 + """ + a_class = self._get_semantic_class(a) + b_class = self._get_semantic_class(b) + # unknown class, default to 1 + if a_class is None or b_class is None: + return 4 + elif a_class == b_class: + return 0 + else: + return 2 * (3 - self._get_class_diff(a_class, b_class)) + + def _get_pos_cost(self, a_pos, b_pos): + """ + 计算基于词性信息的编辑距离cost + :param a_pos: 单词a的词性 + :param b_pos: 单词b的词性 + :return: 替换编辑代价 + """ + if a_pos == b_pos: + return 0 + elif a_pos in self._open_pos and b_pos in self._open_pos: + return 0.25 + else: + return 0.499 + + def _get_char_cost(self, a, b, pinyin_a, pinyin_b): + """ + NOTE: This is a replacement of ERRANTS lemma cost for Chinese + 计算基于字符相似度的编辑距离cost + """ + if not (check_all_chinese(a) and check_all_chinese(b)): + return 0.5 + if len(a) > len(b): + a, b = b, a + pinyin_a, pinyin_b = pinyin_b, pinyin_a + if a == b: + return 0 + else: + return self._get_spell_cost(a, b, pinyin_a, pinyin_b) + + def _get_spell_cost(self, a, b, pinyin_a, pinyin_b): + """ + 计算两个单词拼写相似度,分别由字形相似度和字音相似度组成 + :param a: 单词a + :param b: 单词b,且单词a的长度小于等于b + :param pinyin_a: 单词a的拼音 + :param pinyin_b: 单词b的拼音 + :return: 替换操作cost + """ + count = 0 + for i in range(len(a)): + for j in range(len(b)): + if a[i] == b[j] or (set(pinyin_a) & set(pinyin_b)) or (b[j] in self.confusion_dict.keys() and a[i] in self.confusion_dict[b[j]]) or (a[i] in self.confusion_dict.keys() and b[j] in self.confusion_dict[a[i]]): + count += 1 + break + return (len(a) - count) / (len(a) * 2) + + def get_sub_cost(self, a_seg, b_seg): + """ + Calculate the substitution cost between words a and b + 计算两个单词替换操作的编辑cost,最大为2,等于一次删除和一次添加 + """ + if a_seg[0] == b_seg[0]: + return 0 + + if self.granularity == "word": # 词级别可以额外利用词性信息 + semantic_cost = self._get_semantic_cost(a_seg[0], b_seg[0]) / 6.0 + pos_cost = self._get_pos_cost(a_seg[1], b_seg[1]) + char_cost = self._get_char_cost(a_seg[0], b_seg[0], a_seg[2], b_seg[2]) + return semantic_cost + pos_cost + char_cost + else: # 字级别只能利用字义信息(从大词林中获取)和字面相似度信息 + semantic_cost = self._get_semantic_cost(a_seg[0], b_seg[0]) / 6.0 + if a_seg[0] in punct and b_seg[0] in punct: + pos_cost = 0.0 + elif a_seg[0] not in punct and b_seg[0] not in punct: + pos_cost = 0.25 + else: + pos_cost = 0.499 + # pos_cost = 0.0 if (a_seg[0] in punct and b_seg[0] in punct) or (a_seg[0] not in punct and b_seg[0] not in punct) else 0.5 + char_cost = self._get_char_cost(a_seg[0], b_seg[0], a_seg[2], b_seg[2]) + return semantic_cost + char_cost + 
pos_cost + + def align(self, + src: List[Tuple], + tgt: List[Tuple]): + """ + Based on ERRANT's alignment + 基于改进的动态规划算法,为原句子的每个字打上编辑标签,以便使它能够成功转换为目标句子。 + 编辑操作类别: + 1) M:Match,即KEEP,即当前字保持不变 + 2) D:Delete,删除,即当前字需要被删除 + 3) I:Insert,插入,即当前字需要被插入 + 4) T:Transposition,移位操作,即涉及到词序问题 + """ + cost_matrix = np.zeros((len(src) + 1, len(tgt) + 1)) # 编辑cost矩阵 + oper_matrix = np.full( + (len(src) + 1, len(tgt) + 1), "O", dtype=object + ) # 操作矩阵 + # Fill in the edges + for i in range(1, len(src) + 1): + cost_matrix[i][0] = cost_matrix[i - 1][0] + 1 + oper_matrix[i][0] = ["D"] + for j in range(1, len(tgt) + 1): + cost_matrix[0][j] = cost_matrix[0][j - 1] + 1 + oper_matrix[0][j] = ["I"] + + # Loop through the cost matrix + for i in range(len(src)): + for j in range(len(tgt)): + # Matches + if src[i][0] == tgt[j][0]: # 如果两个字相等,则匹配成功(Match),编辑距离为0 + cost_matrix[i + 1][j + 1] = cost_matrix[i][j] + oper_matrix[i + 1][j + 1] = ["M"] + # Non-matches + else: + del_cost = cost_matrix[i][j + 1] + self.deletion_cost # 由删除动作得到的总cost + ins_cost = cost_matrix[i + 1][j] + self.insertion_cost # 由插入动作得到的总cost + sub_cost = cost_matrix[i][j] + self.get_sub_cost( + src[i], tgt[j] + ) # 由替换动作得到的总cost + # Calculate transposition cost + # 计算移位操作的总cost + trans_cost = float("inf") + k = 1 + while ( + i - k >= 0 + and j - k >= 0 + and cost_matrix[i - k + 1][j - k + 1] + != cost_matrix[i - k][j - k] + ): + p1 = sorted([a[0] for a in src][i - k: i + 1]) + p2 = sorted([b[0] for b in tgt][j - k: j + 1]) + if p1 == p2: + trans_cost = cost_matrix[i - k][j - k] + k + break + k += 1 + + costs = [trans_cost, sub_cost, ins_cost, del_cost] + ind = costs.index(min(costs)) + cost_matrix[i + 1][j + 1] = costs[ind] + # ind = costs.index(costs[ind], ind+1) + for idx, cost in enumerate(costs): + if cost == costs[ind]: + if idx == 0: + if oper_matrix[i + 1][j + 1] == "O": + oper_matrix[i + 1][j + 1] = ["T" + str(k + 1)] + else: + oper_matrix[i + 1][j + 1].append("T" + str(k + 1)) + elif idx == 1: + if oper_matrix[i + 1][j + 1] == "O": + oper_matrix[i + 1][j + 1] = ["S"] + else: + oper_matrix[i + 1][j + 1].append("S") + elif idx == 2: + if oper_matrix[i + 1][j + 1] == "O": + oper_matrix[i + 1][j + 1] = ["I"] + else: + oper_matrix[i + 1][j + 1].append("I") + else: + if oper_matrix[i + 1][j + 1] == "O": + oper_matrix[i + 1][j + 1] = ["D"] + else: + oper_matrix[i + 1][j + 1].append("D") + return cost_matrix, oper_matrix + + def _dfs(self, i, j, align_seq_now, oper_matrix, strategy="all"): + """ + 深度优先遍历,获取最小编辑距离相同的所有序列 + """ + if i + j == 0: + self.align_seqs.append(align_seq_now) + else: + ops = oper_matrix[i][j] # 可以类比成搜索一棵树从根结点到叶子结点的所有路径 + if strategy != "all": ops = ops[:1] + for op in ops: + if op in {"M", "S"}: + self._dfs(i - 1, j - 1, align_seq_now + [(op, i - 1, i, j - 1, j)], oper_matrix, strategy) + elif op == "D": + self._dfs(i - 1, j, align_seq_now + [(op, i - 1, i, j, j)], oper_matrix, strategy) + elif op == "I": + self._dfs(i, j - 1, align_seq_now + [(op, i, i, j - 1, j)], oper_matrix, strategy) + else: + k = int(op[1:]) + self._dfs(i - k, j - k, align_seq_now + [(op, i - k, i, j - k, j)], oper_matrix, strategy) + + def get_cheapest_align_seq(self, oper_matrix): + """ + 回溯获得编辑距离最小的编辑序列 + """ + self.align_seqs = [] + i = oper_matrix.shape[0] - 1 + j = oper_matrix.shape[1] - 1 + if abs(i - j) > 10: + self._dfs(i, j , [], oper_matrix, "first") + else: + self._dfs(i, j , [], oper_matrix, "all") + final_align_seqs = [seq[::-1] for seq in self.align_seqs] + return final_align_seqs + + +if __name__ == "__main__": + tokenizer = 
Tokenizer("word") + semantic_dict, semantic_class = read_cilin() + confusion_dict = read_confusion() + alignment = Alignment(semantic_dict, confusion_dict) + sents = ["首先 , 我们 得 准备 : 大 虾六 到 九 只 、 盐 一 茶匙 、 已 搾 好 的 柠檬汁 三 汤匙 、 泰国 柠檬 叶三叶 、 柠檬 香草 一 根 、 鱼酱 两 汤匙 、 辣椒 6 粒 , 纯净 水 4量杯 、 香菜 半量杯 和 草菇 10 个 。".replace(" ", ""), "首先 , 我们 得 准备 : 大 虾六 到 九 只 、 盐 一 茶匙 、 已 榨 好 的 柠檬汁 三 汤匙 、 泰国 柠檬 叶三叶 、 柠檬 香草 一 根 、 鱼酱 两 汤匙 、 辣椒 六 粒 , 纯净 水 四 量杯 、 香菜 半量杯 和 草菇 十 个 。".replace(" ", "")] + src, tgt = tokenizer(sents) alignment(src, tgt, verbose=True) \ No newline at end of file diff --git a/opencompass/datasets/lawbench/utils/modules/annotator.py b/opencompass/datasets/lawbench/utils/modules/annotator.py index d7b00d06..4bb40597 100644 --- a/opencompass/datasets/lawbench/utils/modules/annotator.py +++ b/opencompass/datasets/lawbench/utils/modules/annotator.py @@ -1,76 +1,76 @@ -from typing import List, Tuple -from modules.alignment import read_cilin, read_confusion, Alignment -from modules.merger import Merger -from modules.classifier import Classifier - -class Annotator: - def __init__(self, - align: Alignment, - merger: Merger, - classifier: Classifier, - granularity: str = "word", - strategy: str = "first"): - self.align = align - self.merger = merger - self.classifier = classifier - self.granularity = granularity - self.strategy = strategy - - @classmethod - def create_default(cls, granularity: str = "word", strategy: str = "first"): - """ - Default parameters used in the paper - """ - semantic_dict, semantic_class = read_cilin() - confusion_dict = read_confusion() - align = Alignment(semantic_dict, confusion_dict, granularity) - merger = Merger(granularity) - classifier = Classifier(granularity) - return cls(align, merger, classifier, granularity, strategy) - - def __call__(self, - src: List[Tuple], - tgt: List[Tuple], - annotator_id: int = 0, - verbose: bool = False): - """ - Align sentences and annotate them with error type information - """ - src_tokens = [x[0] for x in src] - tgt_tokens = [x[0] for x in tgt] - src_str = "".join(src_tokens) - tgt_str = "".join(tgt_tokens) - # convert to text form - annotations_out = ["S " + " ".join(src_tokens) + "\n"] - if tgt_str == "没有错误" or src_str == tgt_str: # Error Free Case - annotations_out.append(f"T{annotator_id} 没有错误\n") - cors = [tgt_str] - op, toks, inds = "noop", "-NONE-", (-1, -1) - a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" - annotations_out.append(a_str) - elif tgt_str == "无法标注": # Not Annotatable Case - annotations_out.append(f"T{annotator_id} 无法标注\n") - cors = [tgt_str] - op, toks, inds = "NA", "-NONE-", (-1, -1) - a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" - annotations_out.append(a_str) - else: # Other - align_objs = self.align(src, tgt) - edit_objs = [] - align_idx = 0 - if self.strategy == "first": - align_objs = align_objs[:1] - for align_obj in align_objs: - edits = self.merger(align_obj, src, tgt, verbose) - if edits not in edit_objs: - edit_objs.append(edits) - annotations_out.append(f"T{annotator_id}-A{align_idx} " + " ".join(tgt_tokens) + "\n") - align_idx += 1 - cors = self.classifier(src, tgt, edits, verbose) - # annotations_out = [] - for cor in cors: - op, toks, inds = cor.op, cor.toks, cor.inds - a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" - annotations_out.append(a_str) - annotations_out.append("\n") - return annotations_out, cors +from typing import List, Tuple +from modules.alignment import read_cilin, 
read_confusion, Alignment +from modules.merger import Merger +from modules.classifier import Classifier + +class Annotator: + def __init__(self, + align: Alignment, + merger: Merger, + classifier: Classifier, + granularity: str = "word", + strategy: str = "first"): + self.align = align + self.merger = merger + self.classifier = classifier + self.granularity = granularity + self.strategy = strategy + + @classmethod + def create_default(cls, granularity: str = "word", strategy: str = "first"): + """ + Default parameters used in the paper + """ + semantic_dict, semantic_class = read_cilin() + confusion_dict = read_confusion() + align = Alignment(semantic_dict, confusion_dict, granularity) + merger = Merger(granularity) + classifier = Classifier(granularity) + return cls(align, merger, classifier, granularity, strategy) + + def __call__(self, + src: List[Tuple], + tgt: List[Tuple], + annotator_id: int = 0, + verbose: bool = False): + """ + Align sentences and annotate them with error type information + """ + src_tokens = [x[0] for x in src] + tgt_tokens = [x[0] for x in tgt] + src_str = "".join(src_tokens) + tgt_str = "".join(tgt_tokens) + # convert to text form + annotations_out = ["S " + " ".join(src_tokens) + "\n"] + if tgt_str == "没有错误" or src_str == tgt_str: # Error Free Case + annotations_out.append(f"T{annotator_id} 没有错误\n") + cors = [tgt_str] + op, toks, inds = "noop", "-NONE-", (-1, -1) + a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" + annotations_out.append(a_str) + elif tgt_str == "无法标注": # Not Annotatable Case + annotations_out.append(f"T{annotator_id} 无法标注\n") + cors = [tgt_str] + op, toks, inds = "NA", "-NONE-", (-1, -1) + a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" + annotations_out.append(a_str) + else: # Other + align_objs = self.align(src, tgt) + edit_objs = [] + align_idx = 0 + if self.strategy == "first": + align_objs = align_objs[:1] + for align_obj in align_objs: + edits = self.merger(align_obj, src, tgt, verbose) + if edits not in edit_objs: + edit_objs.append(edits) + annotations_out.append(f"T{annotator_id}-A{align_idx} " + " ".join(tgt_tokens) + "\n") + align_idx += 1 + cors = self.classifier(src, tgt, edits, verbose) + # annotations_out = [] + for cor in cors: + op, toks, inds = cor.op, cor.toks, cor.inds + a_str = f"A {inds[0]} {inds[1]}|||{op}|||{toks}|||REQUIRED|||-NONE-|||{annotator_id}\n" + annotations_out.append(a_str) + annotations_out.append("\n") + return annotations_out, cors diff --git a/opencompass/datasets/lawbench/utils/modules/classifier.py b/opencompass/datasets/lawbench/utils/modules/classifier.py index a8e9b921..9ee1acb5 100644 --- a/opencompass/datasets/lawbench/utils/modules/classifier.py +++ b/opencompass/datasets/lawbench/utils/modules/classifier.py @@ -1,150 +1,150 @@ -from char_smi import CharFuncs -from collections import namedtuple -from pypinyin import pinyin, Style -import os -Correction = namedtuple( - "Correction", - [ - "op", - "toks", - "inds", - ], -) -char_smi = CharFuncs(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "char_meta.txt")) - -def check_spell_error(src_span: str, - tgt_span: str, - threshold: float = 0.8) -> bool: - if len(src_span) != len(tgt_span): - return False - src_chars = [ch for ch in src_span] - tgt_chars = [ch for ch in tgt_span] - if sorted(src_chars) == sorted(tgt_chars): # 词内部字符异位 - return True - for src_char, tgt_char in zip(src_chars, tgt_chars): - if src_char != 
tgt_char: - if src_char not in char_smi.data or tgt_char not in char_smi.data: - return False - v_sim = char_smi.shape_similarity(src_char, tgt_char) - p_sim = char_smi.pronunciation_similarity(src_char, tgt_char) - if v_sim + p_sim < threshold and not ( - set(pinyin(src_char, style=Style.NORMAL, heteronym=True)[0]) & set(pinyin(tgt_char, style=Style.NORMAL, heteronym=True)[0])): - return False - return True - -class Classifier: - """ - 错误类型分类器 - """ - def __init__(self, - granularity: str = "word"): - - self.granularity = granularity - - @staticmethod - def get_pos_type(pos): - if pos in {"n", "nd"}: - return "NOUN" - if pos in {"nh", "ni", "nl", "ns", "nt", "nz"}: - return "NOUN-NE" - if pos in {"v"}: - return "VERB" - if pos in {"a", "b"}: - return "ADJ" - if pos in {"c"}: - return "CONJ" - if pos in {"r"}: - return "PRON" - if pos in {"d"}: - return "ADV" - if pos in {"u"}: - return "AUX" - # if pos in {"k"}: # TODO 后缀词比例太少,暂且分入其它 - # return "SUFFIX" - if pos in {"m"}: - return "NUM" - if pos in {"p"}: - return "PREP" - if pos in {"q"}: - return "QUAN" - if pos in {"wp"}: - return "PUNCT" - return "OTHER" - - def __call__(self, - src, - tgt, - edits, - verbose: bool = False): - """ - 为编辑操作划分错误类型 - :param src: 错误句子信息 - :param tgt: 正确句子信息 - :param edits: 编辑操作 - :param verbose: 是否打印信息 - :return: 划分完错误类型后的编辑操作 - """ - results = [] - src_tokens = [x[0] for x in src] - tgt_tokens = [x[0] for x in tgt] - for edit in edits: - error_type = edit[0] - src_span = " ".join(src_tokens[edit[1]: edit[2]]) - tgt_span = " ".join(tgt_tokens[edit[3]: edit[4]]) - # print(tgt_span) - cor = None - if error_type[0] == "T": - cor = Correction("W", tgt_span, (edit[1], edit[2])) - elif error_type[0] == "D": - if self.granularity == "word": # 词级别可以细分错误类型 - if edit[2] - edit[1] > 1: # 词组冗余暂时分为OTHER - cor = Correction("R:OTHER", "-NONE-", (edit[1], edit[2])) - else: - pos = self.get_pos_type(src[edit[1]][1]) - pos = "NOUN" if pos == "NOUN-NE" else pos - pos = "MC" if tgt_span == "[缺失成分]" else pos - cor = Correction("R:{:s}".format(pos), "-NONE-", (edit[1], edit[2])) - else: # 字级别可以只需要根据操作划分类型即可 - cor = Correction("R", "-NONE-", (edit[1], edit[2])) - elif error_type[0] == "I": - if self.granularity == "word": # 词级别可以细分错误类型 - if edit[4] - edit[3] > 1: # 词组丢失暂时分为OTHER - cor = Correction("M:OTHER", tgt_span, (edit[1], edit[2])) - else: - pos = self.get_pos_type(tgt[edit[3]][1]) - pos = "NOUN" if pos == "NOUN-NE" else pos - pos = "MC" if tgt_span == "[缺失成分]" else pos - cor = Correction("M:{:s}".format(pos), tgt_span, (edit[1], edit[2])) - else: # 字级别可以只需要根据操作划分类型即可 - cor = Correction("M", tgt_span, (edit[1], edit[2])) - elif error_type[0] == "S": - if self.granularity == "word": # 词级别可以细分错误类型 - if check_spell_error(src_span.replace(" ", ""), tgt_span.replace(" ", "")): - cor = Correction("S:SPELL", tgt_span, (edit[1], edit[2])) - # Todo 暂且不单独区分命名实体拼写错误 - # if edit[4] - edit[3] > 1: - # cor = Correction("S:SPELL:COMMON", tgt_span, (edit[1], edit[2])) - # else: - # pos = self.get_pos_type(tgt[edit[3]][1]) - # if pos == "NOUN-NE": # 命名实体拼写有误 - # cor = Correction("S:SPELL:NE", tgt_span, (edit[1], edit[2])) - # else: # 普通词语拼写有误 - # cor = Correction("S:SPELL:COMMON", tgt_span, (edit[1], edit[2])) - else: - if edit[4] - edit[3] > 1: # 词组被替换暂时分为OTHER - cor = Correction("S:OTHER", tgt_span, (edit[1], edit[2])) - else: - pos = self.get_pos_type(tgt[edit[3]][1]) - pos = "NOUN" if pos == "NOUN-NE" else pos - pos = "MC" if tgt_span == "[缺失成分]" else pos - cor = Correction("S:{:s}".format(pos), tgt_span, (edit[1], edit[2])) - else: # 
字级别可以只需要根据操作划分类型即可 - cor = Correction("S", tgt_span, (edit[1], edit[2])) - results.append(cor) - if verbose: - print("========== Corrections ==========") - for cor in results: - print("Type: {:s}, Position: {:d} -> {:d}, Target: {:s}".format(cor.op, cor.inds[0], cor.inds[1], cor.toks)) - return results - -# print(pinyin("朝", style=Style.NORMAL)) +from char_smi import CharFuncs +from collections import namedtuple +from pypinyin import pinyin, Style +import os +Correction = namedtuple( + "Correction", + [ + "op", + "toks", + "inds", + ], +) +char_smi = CharFuncs(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "char_meta.txt")) + +def check_spell_error(src_span: str, + tgt_span: str, + threshold: float = 0.8) -> bool: + if len(src_span) != len(tgt_span): + return False + src_chars = [ch for ch in src_span] + tgt_chars = [ch for ch in tgt_span] + if sorted(src_chars) == sorted(tgt_chars): # 词内部字符异位 + return True + for src_char, tgt_char in zip(src_chars, tgt_chars): + if src_char != tgt_char: + if src_char not in char_smi.data or tgt_char not in char_smi.data: + return False + v_sim = char_smi.shape_similarity(src_char, tgt_char) + p_sim = char_smi.pronunciation_similarity(src_char, tgt_char) + if v_sim + p_sim < threshold and not ( + set(pinyin(src_char, style=Style.NORMAL, heteronym=True)[0]) & set(pinyin(tgt_char, style=Style.NORMAL, heteronym=True)[0])): + return False + return True + +class Classifier: + """ + 错误类型分类器 + """ + def __init__(self, + granularity: str = "word"): + + self.granularity = granularity + + @staticmethod + def get_pos_type(pos): + if pos in {"n", "nd"}: + return "NOUN" + if pos in {"nh", "ni", "nl", "ns", "nt", "nz"}: + return "NOUN-NE" + if pos in {"v"}: + return "VERB" + if pos in {"a", "b"}: + return "ADJ" + if pos in {"c"}: + return "CONJ" + if pos in {"r"}: + return "PRON" + if pos in {"d"}: + return "ADV" + if pos in {"u"}: + return "AUX" + # if pos in {"k"}: # TODO 后缀词比例太少,暂且分入其它 + # return "SUFFIX" + if pos in {"m"}: + return "NUM" + if pos in {"p"}: + return "PREP" + if pos in {"q"}: + return "QUAN" + if pos in {"wp"}: + return "PUNCT" + return "OTHER" + + def __call__(self, + src, + tgt, + edits, + verbose: bool = False): + """ + 为编辑操作划分错误类型 + :param src: 错误句子信息 + :param tgt: 正确句子信息 + :param edits: 编辑操作 + :param verbose: 是否打印信息 + :return: 划分完错误类型后的编辑操作 + """ + results = [] + src_tokens = [x[0] for x in src] + tgt_tokens = [x[0] for x in tgt] + for edit in edits: + error_type = edit[0] + src_span = " ".join(src_tokens[edit[1]: edit[2]]) + tgt_span = " ".join(tgt_tokens[edit[3]: edit[4]]) + # print(tgt_span) + cor = None + if error_type[0] == "T": + cor = Correction("W", tgt_span, (edit[1], edit[2])) + elif error_type[0] == "D": + if self.granularity == "word": # 词级别可以细分错误类型 + if edit[2] - edit[1] > 1: # 词组冗余暂时分为OTHER + cor = Correction("R:OTHER", "-NONE-", (edit[1], edit[2])) + else: + pos = self.get_pos_type(src[edit[1]][1]) + pos = "NOUN" if pos == "NOUN-NE" else pos + pos = "MC" if tgt_span == "[缺失成分]" else pos + cor = Correction("R:{:s}".format(pos), "-NONE-", (edit[1], edit[2])) + else: # 字级别可以只需要根据操作划分类型即可 + cor = Correction("R", "-NONE-", (edit[1], edit[2])) + elif error_type[0] == "I": + if self.granularity == "word": # 词级别可以细分错误类型 + if edit[4] - edit[3] > 1: # 词组丢失暂时分为OTHER + cor = Correction("M:OTHER", tgt_span, (edit[1], edit[2])) + else: + pos = self.get_pos_type(tgt[edit[3]][1]) + pos = "NOUN" if pos == "NOUN-NE" else pos + pos = "MC" if tgt_span == "[缺失成分]" else pos + cor = 
Correction("M:{:s}".format(pos), tgt_span, (edit[1], edit[2])) + else: # 字级别可以只需要根据操作划分类型即可 + cor = Correction("M", tgt_span, (edit[1], edit[2])) + elif error_type[0] == "S": + if self.granularity == "word": # 词级别可以细分错误类型 + if check_spell_error(src_span.replace(" ", ""), tgt_span.replace(" ", "")): + cor = Correction("S:SPELL", tgt_span, (edit[1], edit[2])) + # Todo 暂且不单独区分命名实体拼写错误 + # if edit[4] - edit[3] > 1: + # cor = Correction("S:SPELL:COMMON", tgt_span, (edit[1], edit[2])) + # else: + # pos = self.get_pos_type(tgt[edit[3]][1]) + # if pos == "NOUN-NE": # 命名实体拼写有误 + # cor = Correction("S:SPELL:NE", tgt_span, (edit[1], edit[2])) + # else: # 普通词语拼写有误 + # cor = Correction("S:SPELL:COMMON", tgt_span, (edit[1], edit[2])) + else: + if edit[4] - edit[3] > 1: # 词组被替换暂时分为OTHER + cor = Correction("S:OTHER", tgt_span, (edit[1], edit[2])) + else: + pos = self.get_pos_type(tgt[edit[3]][1]) + pos = "NOUN" if pos == "NOUN-NE" else pos + pos = "MC" if tgt_span == "[缺失成分]" else pos + cor = Correction("S:{:s}".format(pos), tgt_span, (edit[1], edit[2])) + else: # 字级别可以只需要根据操作划分类型即可 + cor = Correction("S", tgt_span, (edit[1], edit[2])) + results.append(cor) + if verbose: + print("========== Corrections ==========") + for cor in results: + print("Type: {:s}, Position: {:d} -> {:d}, Target: {:s}".format(cor.op, cor.inds[0], cor.inds[1], cor.toks)) + return results + +# print(pinyin("朝", style=Style.NORMAL)) diff --git a/opencompass/datasets/lawbench/utils/modules/merger.py b/opencompass/datasets/lawbench/utils/modules/merger.py index 26e7039b..8c0f6db6 100644 --- a/opencompass/datasets/lawbench/utils/modules/merger.py +++ b/opencompass/datasets/lawbench/utils/modules/merger.py @@ -1,273 +1,273 @@ -from itertools import groupby -from string import punctuation -from typing import List -from modules.tokenizer import Tokenizer -from modules.alignment import Alignment, read_cilin, read_confusion -import Levenshtein - -class Merger: - """ - 合并编辑操作,从Token-Level转换为Span-Level - """ - - def __init__(self, - granularity: str = "word", - merge: bool = False): - chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟–—‘'‛“”„‟…‧." - self.punctuation = punctuation + chinese_punct - self.not_merge_token = [punct for punct in self.punctuation] - self.granularity = granularity - self.merge = merge - - @staticmethod - def _merge_edits(seq, tag="X"): - if seq: - return [(tag, seq[0][1], seq[-1][2], seq[0][3], seq[-1][4])] - else: - return seq - - @staticmethod - def _check_revolve(span_a, span_b): - span_a = span_a + span_a - return span_b in span_a - - def _process_seq(self, seq, src_tokens, tgt_tokens): - if len(seq) <= 1: - return seq - - ops = [op[0] for op in seq] - if set(ops) == {"D"} or set(ops) == {"I"}: - return self._merge_edits(seq, set(ops).pop()) - - if set(ops) == {"D", "I"} or set(ops) == {"I", "D"}: - # do not merge this pattern_from_qua.txt - return seq - - if set(ops) == {"S"}: - if self.granularity == "word": - return seq - else: - return self._merge_edits(seq, "S") - - if set(ops) == {"M"}: - return self._merge_edits(seq, "M") - - return self._merge_edits(seq, "S") - - def __call__(self, - align_obj, - src: List, - tgt: List, - verbose: bool = False): - """ - Based on ERRANT's merge, adapted for Chinese - """ - src_tokens = [x[0] for x in src] - tgt_tokens = [x[0] for x in tgt] - edits = [] - # Split alignment into groups of M, T and rest. 
(T has a number after it) - # Todo 一旦插入、删除、替换的对象中含有标点,那么不与其它编辑合并 - # Todo 缺失成分标签也不与其它编辑合并 - for op, group in groupby( - align_obj, - lambda x: x[0][0] if x[0][0] in {"M", "T"} else False, - ): - group = list(group) - # T is always split TODO: Evaluate this - if op == "T": - for seq in group: - edits.append(seq) - # Process D, I and S subsequence - else: - # Turn the processed sequence into edits - processed = self._process_seq(group, src_tokens, tgt_tokens) - for seq in processed: - edits.append(seq) - - filtered_edits = [] - i = 0 - while i < len(edits): - e1 = edits[i][0][0] - - if i < len(edits) - 2: - e2 = edits[i + 1][0][0] - e3 = edits[i + 2][0][0] - - # Find "S M S" patterns - # Ex: - # S M S - # 冬阴功 对 外国人 - # 外国人 对 冬阴功 - if e1 == "S" and e2 == "M" and e3 == "S": - w1 = "".join(src_tokens[edits[i][1]: edits[i][2]]) - w2 = "".join(tgt_tokens[edits[i][3]: edits[i][4]]) - w3 = "".join(src_tokens[edits[i + 2][1]: edits[i + 2][2]]) - w4 = "".join(tgt_tokens[edits[i + 2][3]: edits[i + 2][4]]) - if min([len(w1), len(w2), len(w3), len(w4)]) == 1: - if w1 == w4 and w2 == w3: - group = [edits[i], edits[i + 1], edits[i + 2]] - processed = self._merge_edits(group, "T" + str(edits[i+2][2] - edits[i][1])) - for seq in processed: - filtered_edits.append(seq) - i += 3 - else: - filtered_edits.append(edits[i]) - i += 1 - else: - if Levenshtein.distance(w1, w4) <= 1 and Levenshtein.distance(w2, w3) <= 1: - group = [edits[i], edits[i + 1], edits[i + 2]] - processed = self._merge_edits(group, "T" + str(edits[i + 2][2] - edits[i][1])) - for seq in processed: - filtered_edits.append(seq) - i += 3 - else: - filtered_edits.append(edits[i]) - i += 1 - # Find "D M I" or "I M D" patterns - # Ex: - # D M I - # 旅游 去 陌生 的 地方 - # 去 陌生 的 地方 旅游 - elif (e1 == "D" and (e2 == "M" or e2.startswith("T")) and e3 == "I") or (e1 == "I" and (e2 == "M" or e2.startswith("T")) and e3 == "D"): - if e1 == "D": - delete_token = src_tokens[edits[i][1]: edits[i][2]] - insert_token = tgt_tokens[edits[i + 2][3]: edits[i + 2][4]] - else: - delete_token = src_tokens[edits[i + 2][1]: edits[i + 2][2]] - insert_token = tgt_tokens[edits[i][3]: edits[i][4]] - a, b = "".join(delete_token), "".join(insert_token) - if len(a) < len(b): - a, b = b, a - if a not in self.punctuation and b not in self.punctuation and len(a) - len(b) <= 1: - if len(b) == 1: - if a == b: - group = [edits[i], edits[i + 1], edits[i + 2]] - processed = self._merge_edits(group, "T" + str(edits[i+2][2] - edits[i][1])) - for seq in processed: - filtered_edits.append(seq) - i += 3 - else: - filtered_edits.append(edits[i]) - i += 1 - else: - if Levenshtein.distance(a, b) <= 1 or (len(a) == len(b) and self._check_revolve(a, b)): - group = [edits[i], edits[i + 1], edits[i + 2]] - processed = self._merge_edits(group, "T" + str(edits[i + 2][2] - edits[i][1])) - for seq in processed: - filtered_edits.append(seq) - i += 3 - else: - filtered_edits.append(edits[i]) - i += 1 - else: - filtered_edits.append(edits[i]) - i += 1 - else: - if e1 != "M": - filtered_edits.append(edits[i]) - i += 1 - else: - if e1 != "M": - filtered_edits.append(edits[i]) - i += 1 - # In rare cases with word-level tokenization, the following error can occur: - # M D S M - # 有 時 住 上層 - # 有 時住 上層 - # Which results in S: 時住 --> 時住 - # We need to filter this case out - second_filter = [] - for edit in filtered_edits: # 避免因为分词错误导致的mismatch现象 - span1 = "".join(src_tokens[edit[1] : edit[2]]) - span2 = "".join(tgt_tokens[edit[3] : edit[4]]) - - if span1 != span2: - if edit[0] == "S": - b = True - # In rare cases 
with word-level tokenization, the following error can occur: - # S I I M - # 负责任 老师 - # 负 责任 的 老师 - # Which results in S: 负责任 --> 负 责任 的 - # We need to convert this edit to I: --> 的 - - # 首部有重叠 - common_str = "" - tmp_new_start_1 = edit[1] - for i in range(edit[1], edit[2]): - if not span2.startswith(common_str + src_tokens[i]): - break - common_str += src_tokens[i] - tmp_new_start_1 = i + 1 - new_start_1, new_start_2 = edit[1], edit[3] - if common_str: - tmp_str = "" - for i in range(edit[3], edit[4]): - tmp_str += tgt_tokens[i] - if tmp_str == common_str: - new_start_1, new_start_2 = tmp_new_start_1, i + 1 - # second_filter.append(("S", new_start_1, edit[2], i + 1, edit[4])) - b = False - break - elif len(tmp_str) > len(common_str): - break - # 尾部有重叠 - common_str = "" - new_end_1, new_end_2 = edit[2], edit[4] - tmp_new_end_1 = edit[2] - for i in reversed(range(new_start_1, edit[2])): - if not span2.endswith(src_tokens[i] + common_str): - break - common_str = src_tokens[i] + common_str - tmp_new_end_1 = i - if common_str: - tmp_str = "" - for i in reversed(range(new_start_2, edit[4])): - tmp_str = tgt_tokens[i] + tmp_str - if tmp_str == common_str: - new_end_1, new_end_2 = tmp_new_end_1, i - b = False - break - elif len(tmp_str) > len(common_str): - break - if b: - second_filter.append(edit) - else: - if new_start_1 == new_end_1: - new_edit = ("I", new_start_1, new_end_1, new_start_2, new_end_2) - elif new_start_2 == new_end_2: - new_edit = ("D", new_start_1, new_end_1, new_start_2, new_end_2) - else: - new_edit = ("S", new_start_1, new_end_1, new_start_2, new_end_2) - second_filter.append(new_edit) - else: - second_filter.append(edit) - if verbose: - print("========== Parallels ==========") - print("".join(src_tokens)) - print("".join(tgt_tokens)) - print("========== Results ==========") - for edit in second_filter: - op = edit[0] - s = " ".join(src_tokens[edit[1]: edit[2]]) - t = " ".join(tgt_tokens[edit[3]: edit[4]]) - print(f"{op}:\t{s}\t-->\t{t}") - print("========== Infos ==========") - print(str(src)) - print(str(tgt)) - return second_filter - -if __name__ == "__main__": - tokenizer = Tokenizer("char") - semantic_dict, semantic_class = read_cilin() - confusion_dict = read_confusion() - alignment = Alignment(semantic_dict, confusion_dict) - sents = [ - "所 以 印 度 对 全 世 界 人 没 有 说 服 不 要 吃 牛 肉 。".replace( - " ", ""), - "所 以 印 度 没 有 说 服 全 世 界 人 不 要 吃 牛 肉 。".replace( - " ", "")] - src, tgt = tokenizer(sents) - align_obj = alignment(src, tgt) - m = Merger() +from itertools import groupby +from string import punctuation +from typing import List +from modules.tokenizer import Tokenizer +from modules.alignment import Alignment, read_cilin, read_confusion +import Levenshtein + +class Merger: + """ + 合并编辑操作,从Token-Level转换为Span-Level + """ + + def __init__(self, + granularity: str = "word", + merge: bool = False): + chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟–—‘'‛“”„‟…‧." 
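# A simplified sketch of the check_spell_error heuristic from classifier.py above: an
# equal-length substitution counts as a spelling error when the two spans are anagrams
# of each other, or when every differing character pair is similar enough. The real
# check combines shape/pronunciation similarity scores from char_meta.txt; this sketch
# keeps only the pinyin-overlap fallback (looks_like_spell_error is a hypothetical
# helper name, not part of the module):
from pypinyin import Style, pinyin

def looks_like_spell_error(src_span: str, tgt_span: str) -> bool:
    if len(src_span) != len(tgt_span):           # only equal-length spans qualify
        return False
    if sorted(src_span) == sorted(tgt_span):     # transposed characters inside a word
        return True
    for s, t in zip(src_span, tgt_span):
        if s != t:
            s_py = set(pinyin(s, style=Style.NORMAL, heteronym=True)[0])
            t_py = set(pinyin(t, style=Style.NORMAL, heteronym=True)[0])
            if not s_py & t_py:                  # no shared reading -> not a typo
                return False
    return True

print(looks_like_spell_error("搬法", "办法"))    # True: 搬/办 share the reading "ban"
print(looks_like_spell_error("方法", "想法"))    # False: 方/想 share no reading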
+ self.punctuation = punctuation + chinese_punct + self.not_merge_token = [punct for punct in self.punctuation] + self.granularity = granularity + self.merge = merge + + @staticmethod + def _merge_edits(seq, tag="X"): + if seq: + return [(tag, seq[0][1], seq[-1][2], seq[0][3], seq[-1][4])] + else: + return seq + + @staticmethod + def _check_revolve(span_a, span_b): + span_a = span_a + span_a + return span_b in span_a + + def _process_seq(self, seq, src_tokens, tgt_tokens): + if len(seq) <= 1: + return seq + + ops = [op[0] for op in seq] + if set(ops) == {"D"} or set(ops) == {"I"}: + return self._merge_edits(seq, set(ops).pop()) + + if set(ops) == {"D", "I"} or set(ops) == {"I", "D"}: + # do not merge this pattern_from_qua.txt + return seq + + if set(ops) == {"S"}: + if self.granularity == "word": + return seq + else: + return self._merge_edits(seq, "S") + + if set(ops) == {"M"}: + return self._merge_edits(seq, "M") + + return self._merge_edits(seq, "S") + + def __call__(self, + align_obj, + src: List, + tgt: List, + verbose: bool = False): + """ + Based on ERRANT's merge, adapted for Chinese + """ + src_tokens = [x[0] for x in src] + tgt_tokens = [x[0] for x in tgt] + edits = [] + # Split alignment into groups of M, T and rest. (T has a number after it) + # Todo 一旦插入、删除、替换的对象中含有标点,那么不与其它编辑合并 + # Todo 缺失成分标签也不与其它编辑合并 + for op, group in groupby( + align_obj, + lambda x: x[0][0] if x[0][0] in {"M", "T"} else False, + ): + group = list(group) + # T is always split TODO: Evaluate this + if op == "T": + for seq in group: + edits.append(seq) + # Process D, I and S subsequence + else: + # Turn the processed sequence into edits + processed = self._process_seq(group, src_tokens, tgt_tokens) + for seq in processed: + edits.append(seq) + + filtered_edits = [] + i = 0 + while i < len(edits): + e1 = edits[i][0][0] + + if i < len(edits) - 2: + e2 = edits[i + 1][0][0] + e3 = edits[i + 2][0][0] + + # Find "S M S" patterns + # Ex: + # S M S + # 冬阴功 对 外国人 + # 外国人 对 冬阴功 + if e1 == "S" and e2 == "M" and e3 == "S": + w1 = "".join(src_tokens[edits[i][1]: edits[i][2]]) + w2 = "".join(tgt_tokens[edits[i][3]: edits[i][4]]) + w3 = "".join(src_tokens[edits[i + 2][1]: edits[i + 2][2]]) + w4 = "".join(tgt_tokens[edits[i + 2][3]: edits[i + 2][4]]) + if min([len(w1), len(w2), len(w3), len(w4)]) == 1: + if w1 == w4 and w2 == w3: + group = [edits[i], edits[i + 1], edits[i + 2]] + processed = self._merge_edits(group, "T" + str(edits[i+2][2] - edits[i][1])) + for seq in processed: + filtered_edits.append(seq) + i += 3 + else: + filtered_edits.append(edits[i]) + i += 1 + else: + if Levenshtein.distance(w1, w4) <= 1 and Levenshtein.distance(w2, w3) <= 1: + group = [edits[i], edits[i + 1], edits[i + 2]] + processed = self._merge_edits(group, "T" + str(edits[i + 2][2] - edits[i][1])) + for seq in processed: + filtered_edits.append(seq) + i += 3 + else: + filtered_edits.append(edits[i]) + i += 1 + # Find "D M I" or "I M D" patterns + # Ex: + # D M I + # 旅游 去 陌生 的 地方 + # 去 陌生 的 地方 旅游 + elif (e1 == "D" and (e2 == "M" or e2.startswith("T")) and e3 == "I") or (e1 == "I" and (e2 == "M" or e2.startswith("T")) and e3 == "D"): + if e1 == "D": + delete_token = src_tokens[edits[i][1]: edits[i][2]] + insert_token = tgt_tokens[edits[i + 2][3]: edits[i + 2][4]] + else: + delete_token = src_tokens[edits[i + 2][1]: edits[i + 2][2]] + insert_token = tgt_tokens[edits[i][3]: edits[i][4]] + a, b = "".join(delete_token), "".join(insert_token) + if len(a) < len(b): + a, b = b, a + if a not in self.punctuation and b not in self.punctuation and len(a) 
- len(b) <= 1: + if len(b) == 1: + if a == b: + group = [edits[i], edits[i + 1], edits[i + 2]] + processed = self._merge_edits(group, "T" + str(edits[i+2][2] - edits[i][1])) + for seq in processed: + filtered_edits.append(seq) + i += 3 + else: + filtered_edits.append(edits[i]) + i += 1 + else: + if Levenshtein.distance(a, b) <= 1 or (len(a) == len(b) and self._check_revolve(a, b)): + group = [edits[i], edits[i + 1], edits[i + 2]] + processed = self._merge_edits(group, "T" + str(edits[i + 2][2] - edits[i][1])) + for seq in processed: + filtered_edits.append(seq) + i += 3 + else: + filtered_edits.append(edits[i]) + i += 1 + else: + filtered_edits.append(edits[i]) + i += 1 + else: + if e1 != "M": + filtered_edits.append(edits[i]) + i += 1 + else: + if e1 != "M": + filtered_edits.append(edits[i]) + i += 1 + # In rare cases with word-level tokenization, the following error can occur: + # M D S M + # 有 時 住 上層 + # 有 時住 上層 + # Which results in S: 時住 --> 時住 + # We need to filter this case out + second_filter = [] + for edit in filtered_edits: # 避免因为分词错误导致的mismatch现象 + span1 = "".join(src_tokens[edit[1] : edit[2]]) + span2 = "".join(tgt_tokens[edit[3] : edit[4]]) + + if span1 != span2: + if edit[0] == "S": + b = True + # In rare cases with word-level tokenization, the following error can occur: + # S I I M + # 负责任 老师 + # 负 责任 的 老师 + # Which results in S: 负责任 --> 负 责任 的 + # We need to convert this edit to I: --> 的 + + # 首部有重叠 + common_str = "" + tmp_new_start_1 = edit[1] + for i in range(edit[1], edit[2]): + if not span2.startswith(common_str + src_tokens[i]): + break + common_str += src_tokens[i] + tmp_new_start_1 = i + 1 + new_start_1, new_start_2 = edit[1], edit[3] + if common_str: + tmp_str = "" + for i in range(edit[3], edit[4]): + tmp_str += tgt_tokens[i] + if tmp_str == common_str: + new_start_1, new_start_2 = tmp_new_start_1, i + 1 + # second_filter.append(("S", new_start_1, edit[2], i + 1, edit[4])) + b = False + break + elif len(tmp_str) > len(common_str): + break + # 尾部有重叠 + common_str = "" + new_end_1, new_end_2 = edit[2], edit[4] + tmp_new_end_1 = edit[2] + for i in reversed(range(new_start_1, edit[2])): + if not span2.endswith(src_tokens[i] + common_str): + break + common_str = src_tokens[i] + common_str + tmp_new_end_1 = i + if common_str: + tmp_str = "" + for i in reversed(range(new_start_2, edit[4])): + tmp_str = tgt_tokens[i] + tmp_str + if tmp_str == common_str: + new_end_1, new_end_2 = tmp_new_end_1, i + b = False + break + elif len(tmp_str) > len(common_str): + break + if b: + second_filter.append(edit) + else: + if new_start_1 == new_end_1: + new_edit = ("I", new_start_1, new_end_1, new_start_2, new_end_2) + elif new_start_2 == new_end_2: + new_edit = ("D", new_start_1, new_end_1, new_start_2, new_end_2) + else: + new_edit = ("S", new_start_1, new_end_1, new_start_2, new_end_2) + second_filter.append(new_edit) + else: + second_filter.append(edit) + if verbose: + print("========== Parallels ==========") + print("".join(src_tokens)) + print("".join(tgt_tokens)) + print("========== Results ==========") + for edit in second_filter: + op = edit[0] + s = " ".join(src_tokens[edit[1]: edit[2]]) + t = " ".join(tgt_tokens[edit[3]: edit[4]]) + print(f"{op}:\t{s}\t-->\t{t}") + print("========== Infos ==========") + print(str(src)) + print(str(tgt)) + return second_filter + +if __name__ == "__main__": + tokenizer = Tokenizer("char") + semantic_dict, semantic_class = read_cilin() + confusion_dict = read_confusion() + alignment = Alignment(semantic_dict, confusion_dict) + sents = [ + "所 以 印 度 
对 全 世 界 人 没 有 说 服 不 要 吃 牛 肉 。".replace( + " ", ""), + "所 以 印 度 没 有 说 服 全 世 界 人 不 要 吃 牛 肉 。".replace( + " ", "")] + src, tgt = tokenizer(sents) + align_obj = alignment(src, tgt) + m = Merger() m(align_obj, src, tgt, verbose=True) \ No newline at end of file diff --git a/opencompass/datasets/lawbench/utils/modules/tokenization.py b/opencompass/datasets/lawbench/utils/modules/tokenization.py index da62badb..6a98cd83 100644 --- a/opencompass/datasets/lawbench/utils/modules/tokenization.py +++ b/opencompass/datasets/lawbench/utils/modules/tokenization.py @@ -1,346 +1,346 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import unicodedata -import six - - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. 
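# A minimal end-to-end sketch of how these modules fit together, mirroring the __main__
# block of merger.py above and adding the error-typing step. It assumes the script is
# run from the utils directory (so `modules` is importable) and that the Classifier
# shown earlier lives in modules/classifier.py:
from modules.tokenizer import Tokenizer
from modules.alignment import Alignment, read_cilin, read_confusion
from modules.merger import Merger
from modules.classifier import Classifier

tokenizer = Tokenizer("char")
semantic_dict, semantic_class = read_cilin()
confusion_dict = read_confusion()
alignment = Alignment(semantic_dict, confusion_dict)
merger = Merger(granularity="char")
classifier = Classifier(granularity="char")

src, tgt = tokenizer(["所以印度对全世界人没有说服不要吃牛肉。",
                      "所以印度没有说服全世界人不要吃牛肉。"])
align_obj = alignment(src, tgt)                           # token-level alignment
edits = merger(align_obj, src, tgt)                       # merge into span-level edits
corrections = classifier(src, tgt, edits, verbose=True)   # assign error types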
- if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r") as reader: - while True: - token = convert_to_unicode(reader.readline()) - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - if item not in vocab: - print("warning: %s not in vocab" % item) - item = "[UNK]" - output.append(vocab[item]) - return output - - -def convert_tokens_to_ids(vocab, tokens): - return convert_by_vocab(vocab, tokens) - - -def convert_ids_to_tokens(inv_vocab, ids): - return convert_by_vocab(inv_vocab, ids) - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class FullTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
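# load_vocab above assigns ids by line order, and convert_by_vocab maps unknown items
# to "[UNK]" (with a warning); the same round-trip in miniature, using a toy vocabulary
# rather than the real chinese_vocab.txt:
toy_vocab = {"[PAD]": 0, "[UNK]": 1, "法": 2, "院": 3}
inv_vocab = {v: k for k, v in toy_vocab.items()}
ids = [toy_vocab.get(tok, toy_vocab["[UNK]"]) for tok in ["法", "院", "书"]]
print(ids)                            # [2, 3, 1] -- "书" falls back to [UNK]
print([inv_vocab[i] for i in ids])    # ['法', '院', '[UNK]']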
- text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer. - Returns: - A list of wordpiece tokens. 
- """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - # output_tokens.append(self.unk_token) - output_tokens.append(token) # keep the UNK token - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
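# The WordpieceTokenizer above uses a greedy longest-match-first scan over the
# vocabulary; the same loop in isolation, with a toy vocabulary (the function name and
# vocab are illustrative only):
def wordpiece(token, vocab, unk="[UNK]"):
    chars, pieces, start = list(token), [], 0
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:
            sub = "".join(chars[start:end])
            if start > 0:
                sub = "##" + sub        # continuation pieces get the ## prefix
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:
            return [unk]                # the module above keeps the raw token instead
        pieces.append(cur)
        start = end
    return pieces

print(wordpiece("unaffable", {"un", "##aff", "##able"}))   # ['un', '##aff', '##able']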
+"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import unicodedata +import six + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + if item not in vocab: + print("warning: %s not in vocab" % item) + item = "[UNK]" + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a peice of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + Args: + do_lower_case: Whether to lower case the input. 
+ """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + # output_tokens.append(self.unk_token) + output_tokens.append(token) # keep the UNK token + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
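# What BasicTokenizer does to mixed Chinese/English text: each CJK character is
# surrounded by spaces, punctuation is split into its own token, and (with
# do_lower_case=True) Latin text is lower-cased and accent-stripped. A usage sketch,
# assuming it is run from the utils directory so that `modules` is importable:
from modules.tokenization import BasicTokenizer

bt = BasicTokenizer(do_lower_case=True)
print(bt.tokenize("法院于2023年判决:Pay 100元!"))
# roughly: ['法', '院', '于', '2023', '年', '判', '决', ':', 'pay', '100', '元', '!']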
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True return False \ No newline at end of file diff --git a/opencompass/datasets/lawbench/utils/modules/tokenizer.py b/opencompass/datasets/lawbench/utils/modules/tokenizer.py index bb7111af..3f39e2c5 100644 --- a/opencompass/datasets/lawbench/utils/modules/tokenizer.py +++ b/opencompass/datasets/lawbench/utils/modules/tokenizer.py @@ -1,91 +1,91 @@ -from ltp import LTP -from typing import List -from pypinyin import pinyin, Style, lazy_pinyin -import torch -import os -import functools - -class Tokenizer: - """ - 分词器 - """ - - def __init__(self, - granularity: str = "word", - device: str = "cpu", - segmented: bool = False, - bpe: bool = False, - ) -> None: - """ - 构造函数 - :param mode: 分词模式,可选级别:字级别(char)、词级别(word) - """ - self.ltp = None - if granularity == "word": - self.ltp = LTP(device=torch.device(device) if torch.cuda.is_available() else torch.device("cpu")) - self.ltp.add_words(words=["[缺失成分]"], max_window=6) - self.segmented = segmented - self.granularity = granularity - if self.granularity == "word": - self.tokenizer = self.split_word - elif self.granularity == "char": - self.tokenizer = functools.partial(self.split_char, bpe=bpe) - else: - raise NotImplementedError - - def __repr__(self) -> str: - return "{:s}\nMode:{:s}\n}".format(str(self.__class__.__name__), self.mode) - - def __call__(self, - input_strings: List[str] - ) -> List: - """ - 分词函数 - :param input_strings: 需要分词的字符串列表 - :return: 分词后的结果列表,由元组组成,元组为(token,pos_tag,pinyin)的形式 - """ - if not self.segmented: - input_strings = ["".join(s.split(" ")) for s in input_strings] - results = self.tokenizer(input_strings) - return results - - def split_char(self, input_strings: List[str], bpe=False) -> List: - """ - 分字函数 - :param input_strings: 需要分字的字符串 - :return: 分字结果 - """ - if bpe: - from . 
import tokenization - tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "chinese_vocab.txt"), do_lower_case=False) - results = [] - for input_string in input_strings: - if not self.segmented: # 如果没有被分字,就按照每个字符隔开(不考虑英文标点的特殊处理,也不考虑BPE),否则遵循原分字结果 - segment_string = " ".join([char for char in input_string] if not bpe else tokenizer.tokenize(input_string)) - else: - segment_string = input_string - # print(segment_string) - segment_string = segment_string.replace("[ 缺 失 成 分 ]", "[缺失成分]").split(" ") # 缺失成分当成一个单独的token - results.append([(char, "unk", pinyin(char, style=Style.NORMAL, heteronym=True)[0]) for char in segment_string]) - return results - - def split_word(self, input_strings: List[str]) -> List: - """ - 分词函数 - :param input_strings: 需要分词的字符串 - :return: 分词结果 - """ - if self.segmented: - seg, hidden = self.ltp.seg([input_string.split(" ") for input_string in input_strings], is_preseged=True) - else: - seg, hidden = self.ltp.seg(input_strings) - pos = self.ltp.pos(hidden) - result = [] - for s, p in zip(seg, pos): - pinyin = [lazy_pinyin(word) for word in s] - result.append(list(zip(s, p, pinyin))) - return result - -if __name__ == "__main__": - tokenizer = Tokenizer("word") - print(tokenizer(["LAC是个优秀的分词工具", "百度是一家高科技公司"])) +from ltp import LTP +from typing import List +from pypinyin import pinyin, Style, lazy_pinyin +import torch +import os +import functools + +class Tokenizer: + """ + 分词器 + """ + + def __init__(self, + granularity: str = "word", + device: str = "cpu", + segmented: bool = False, + bpe: bool = False, + ) -> None: + """ + 构造函数 + :param mode: 分词模式,可选级别:字级别(char)、词级别(word) + """ + self.ltp = None + if granularity == "word": + self.ltp = LTP(device=torch.device(device) if torch.cuda.is_available() else torch.device("cpu")) + self.ltp.add_words(words=["[缺失成分]"], max_window=6) + self.segmented = segmented + self.granularity = granularity + if self.granularity == "word": + self.tokenizer = self.split_word + elif self.granularity == "char": + self.tokenizer = functools.partial(self.split_char, bpe=bpe) + else: + raise NotImplementedError + + def __repr__(self) -> str: + return "{:s}\nMode:{:s}\n}".format(str(self.__class__.__name__), self.mode) + + def __call__(self, + input_strings: List[str] + ) -> List: + """ + 分词函数 + :param input_strings: 需要分词的字符串列表 + :return: 分词后的结果列表,由元组组成,元组为(token,pos_tag,pinyin)的形式 + """ + if not self.segmented: + input_strings = ["".join(s.split(" ")) for s in input_strings] + results = self.tokenizer(input_strings) + return results + + def split_char(self, input_strings: List[str], bpe=False) -> List: + """ + 分字函数 + :param input_strings: 需要分字的字符串 + :return: 分字结果 + """ + if bpe: + from . 
import tokenization + tokenizer = tokenization.FullTokenizer(vocab_file=os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "chinese_vocab.txt"), do_lower_case=False) + results = [] + for input_string in input_strings: + if not self.segmented: # 如果没有被分字,就按照每个字符隔开(不考虑英文标点的特殊处理,也不考虑BPE),否则遵循原分字结果 + segment_string = " ".join([char for char in input_string] if not bpe else tokenizer.tokenize(input_string)) + else: + segment_string = input_string + # print(segment_string) + segment_string = segment_string.replace("[ 缺 失 成 分 ]", "[缺失成分]").split(" ") # 缺失成分当成一个单独的token + results.append([(char, "unk", pinyin(char, style=Style.NORMAL, heteronym=True)[0]) for char in segment_string]) + return results + + def split_word(self, input_strings: List[str]) -> List: + """ + 分词函数 + :param input_strings: 需要分词的字符串 + :return: 分词结果 + """ + if self.segmented: + seg, hidden = self.ltp.seg([input_string.split(" ") for input_string in input_strings], is_preseged=True) + else: + seg, hidden = self.ltp.seg(input_strings) + pos = self.ltp.pos(hidden) + result = [] + for s, p in zip(seg, pos): + pinyin = [lazy_pinyin(word) for word in s] + result.append(list(zip(s, p, pinyin))) + return result + +if __name__ == "__main__": + tokenizer = Tokenizer("word") + print(tokenizer(["LAC是个优秀的分词工具", "百度是一家高科技公司"])) diff --git a/opencompass/datasets/lawbench/utils/parallel_to_m2.py b/opencompass/datasets/lawbench/utils/parallel_to_m2.py index 6b2c035b..30dbb2f1 100644 --- a/opencompass/datasets/lawbench/utils/parallel_to_m2.py +++ b/opencompass/datasets/lawbench/utils/parallel_to_m2.py @@ -1,221 +1,221 @@ -import os -from modules.annotator import Annotator -from modules.tokenizer import Tokenizer -import argparse -from collections import Counter -from tqdm import tqdm -import torch -from collections import defaultdict -from multiprocessing import Pool -from opencc import OpenCC -import timeout_decorator - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -annotator, sentence_to_tokenized = None, None -cc = OpenCC("t2s") - -@timeout_decorator.timeout(10) -def annotate_with_time_out(line): - """ - :param line: - :return: - """ - sent_list = line.split("\t")[1:] - source = sent_list[0] - if args.segmented: - source = source.strip() - else: - source = "".join(source.strip().split()) - output_str = "" - for idx, target in enumerate(sent_list[1:]): - try: - if args.segmented: - target = target.strip() - else: - target = "".join(target.strip().split()) - if not args.no_simplified: - target = cc.convert(target) - source_tokenized, target_tokenized = sentence_to_tokenized[source], sentence_to_tokenized[target] - out, cors = annotator(source_tokenized, target_tokenized, idx) - if idx == 0: - output_str += "".join(out[:-1]) - else: - output_str += "".join(out[1:-1]) - except Exception: - raise Exception - return output_str - - -def annotate(line): - """ - :param line: - :return: - """ - sent_list = line.split("\t")[1:] - source = sent_list[0] - if args.segmented: - source = source.strip() - else: - source = "".join(source.strip().split()) - output_str = "" - for idx, target in enumerate(sent_list[1:]): - try: - if args.segmented: - target = target.strip() - else: - target = "".join(target.strip().split()) - if not args.no_simplified: - target = cc.convert(target) - source_tokenized, target_tokenized = sentence_to_tokenized[source], sentence_to_tokenized[target] - out, cors = annotator(source_tokenized, target_tokenized, idx) - if idx == 0: - output_str += "".join(out[:-1]) - else: - 
output_str += "".join(out[1:-1]) - except Exception: - raise Exception - return output_str - - - - - -def firsttime_process(args): - tokenizer = Tokenizer(args.granularity, args.device, args.segmented, args.bpe) - global annotator, sentence_to_tokenized - annotator = Annotator.create_default(args.granularity, args.multi_cheapest_strategy) - lines = open(args.file, "r", encoding="utf-8").read().strip().split("\n") # format: id src tgt1 tgt2... - # error_types = [] - - with open(args.output, "w", encoding="utf-8") as f: - count = 0 - sentence_set = set() - sentence_to_tokenized = {} - for line in lines: - sent_list = line.split("\t")[1:] - for idx, sent in enumerate(sent_list): - if args.segmented: - # print(sent) - sent = sent.strip() - else: - sent = "".join(sent.split()).strip() - if idx >= 1: - if not args.no_simplified: - sentence_set.add(cc.convert(sent)) - else: - sentence_set.add(sent) - else: - sentence_set.add(sent) - batch = [] - for sent in tqdm(sentence_set): - count += 1 - if sent: - batch.append(sent) - if count % args.batch_size == 0: - results = tokenizer(batch) - for s, r in zip(batch, results): - sentence_to_tokenized[s] = r # Get tokenization map. - batch = [] - if batch: - results = tokenizer(batch) - for s, r in zip(batch, results): - sentence_to_tokenized[s] = r # Get tokenization map. - - timeout_indices = [] - - # 单进程模式 - for idx, line in enumerate(tqdm(lines)): - try: - ret = annotate_with_time_out(line) - except Exception: - timeout_indices.append(idx) - return timeout_indices - - - -def main(args): - timeout_indices = firsttime_process(args) - tokenizer = Tokenizer(args.granularity, args.device, args.segmented, args.bpe) - global annotator, sentence_to_tokenized - annotator = Annotator.create_default(args.granularity, args.multi_cheapest_strategy) - lines = open(args.file, "r", encoding="utf-8").read().strip().split("\n") - new_lines = []# format: id src tgt1 tgt2... - - with open(args.output, "w", encoding="utf-8") as f: - count = 0 - sentence_set = set() - sentence_to_tokenized = {} - for line_idx, line in enumerate(lines): - - if line_idx in timeout_indices: - # print(f"line before split: {line}") - line_split = line.split("\t") - line_number, sent_list = line_split[0], line_split[1:] - assert len(sent_list) == 2 - sent_list[-1] = " 无" - line = line_number + "\t" + "\t".join(sent_list) - # print(f"line time out: {line}") - new_lines.append(line) - else: - new_lines.append(line) - - sent_list = line.split("\t")[1:] - for idx, sent in enumerate(sent_list): - if args.segmented: - # print(sent) - sent = sent.strip() - else: - sent = "".join(sent.split()).strip() - if idx >= 1: - if not args.no_simplified: - sentence_set.add(cc.convert(sent)) - else: - sentence_set.add(sent) - else: - sentence_set.add(sent) - batch = [] - for sent in tqdm(sentence_set): - count += 1 - if sent: - batch.append(sent) - if count % args.batch_size == 0: - results = tokenizer(batch) - for s, r in zip(batch, results): - sentence_to_tokenized[s] = r # Get tokenization map. - batch = [] - if batch: - results = tokenizer(batch) - for s, r in zip(batch, results): - sentence_to_tokenized[s] = r # Get tokenization map. 
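# Usage sketch for the Tokenizer in modules/tokenizer.py above: "char" granularity
# needs no LTP model, while "word" granularity loads LTP for segmentation and POS
# tagging. Run from the utils directory so that `modules` is importable:
from modules.tokenizer import Tokenizer

char_tok = Tokenizer("char")
print(char_tok(["百度是一家高科技公司"]))
# -> per sentence, a list of (char, "unk", [candidate pinyin]) tuples

word_tok = Tokenizer("word")            # loads an LTP model (GPU if available)
print(word_tok(["百度是一家高科技公司"]))
# -> per sentence, a list of (word, pos_tag, [pinyin per character]) tuples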
- - # 单进程模式 - lines = new_lines - for idx, line in enumerate(tqdm(lines)): - ret = annotate(line) - f.write(ret) - f.write("\n") - - # 多进程模式:仅在Linux环境下测试,建议在linux服务器上使用 - # with Pool(args.worker_num) as pool: - # for ret in pool.imap(annotate, tqdm(lines), chunksize=8): - # if ret: - # f.write(ret) - # f.write("\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Choose input file to annotate") - parser.add_argument("-f", "--file", type=str, required=True, help="Input parallel file") - parser.add_argument("-o", "--output", type=str, help="Output file", required=True) - parser.add_argument("-b", "--batch_size", type=int, help="The size of batch", default=128) - parser.add_argument("-d", "--device", type=int, help="The ID of GPU", default=0) - parser.add_argument("-w", "--worker_num", type=int, help="The number of workers", default=16) - parser.add_argument("-g", "--granularity", type=str, help="Choose char-level or word-level evaluation", default="char") - parser.add_argument("-m", "--merge", help="Whether merge continuous replacement/deletion/insertion", action="store_true") - parser.add_argument("-s", "--multi_cheapest_strategy", type=str, choices=["first", "all"], default="all") - parser.add_argument("--segmented", help="Whether tokens have been segmented", action="store_true") # 支持提前token化,用空格隔开 - parser.add_argument("--no_simplified", help="Whether simplifying chinese", action="store_true") # 将所有corrections转换为简体中文 - parser.add_argument("--bpe", help="Whether to use bpe", action="store_true") # 支持 bpe 切分英文单词 - args = parser.parse_args() - main(args) +import os +from modules.annotator import Annotator +from modules.tokenizer import Tokenizer +import argparse +from collections import Counter +from tqdm import tqdm +import torch +from collections import defaultdict +from multiprocessing import Pool +from opencc import OpenCC +import timeout_decorator + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +annotator, sentence_to_tokenized = None, None +cc = OpenCC("t2s") + +@timeout_decorator.timeout(10) +def annotate_with_time_out(line): + """ + :param line: + :return: + """ + sent_list = line.split("\t")[1:] + source = sent_list[0] + if args.segmented: + source = source.strip() + else: + source = "".join(source.strip().split()) + output_str = "" + for idx, target in enumerate(sent_list[1:]): + try: + if args.segmented: + target = target.strip() + else: + target = "".join(target.strip().split()) + if not args.no_simplified: + target = cc.convert(target) + source_tokenized, target_tokenized = sentence_to_tokenized[source], sentence_to_tokenized[target] + out, cors = annotator(source_tokenized, target_tokenized, idx) + if idx == 0: + output_str += "".join(out[:-1]) + else: + output_str += "".join(out[1:-1]) + except Exception: + raise Exception + return output_str + + +def annotate(line): + """ + :param line: + :return: + """ + sent_list = line.split("\t")[1:] + source = sent_list[0] + if args.segmented: + source = source.strip() + else: + source = "".join(source.strip().split()) + output_str = "" + for idx, target in enumerate(sent_list[1:]): + try: + if args.segmented: + target = target.strip() + else: + target = "".join(target.strip().split()) + if not args.no_simplified: + target = cc.convert(target) + source_tokenized, target_tokenized = sentence_to_tokenized[source], sentence_to_tokenized[target] + out, cors = annotator(source_tokenized, target_tokenized, idx) + if idx == 0: + output_str += "".join(out[:-1]) + else: + output_str += "".join(out[1:-1]) + 
except Exception: + raise Exception + return output_str + + + + + +def firsttime_process(args): + tokenizer = Tokenizer(args.granularity, args.device, args.segmented, args.bpe) + global annotator, sentence_to_tokenized + annotator = Annotator.create_default(args.granularity, args.multi_cheapest_strategy) + lines = open(args.file, "r", encoding="utf-8").read().strip().split("\n") # format: id src tgt1 tgt2... + # error_types = [] + + with open(args.output, "w", encoding="utf-8") as f: + count = 0 + sentence_set = set() + sentence_to_tokenized = {} + for line in lines: + sent_list = line.split("\t")[1:] + for idx, sent in enumerate(sent_list): + if args.segmented: + # print(sent) + sent = sent.strip() + else: + sent = "".join(sent.split()).strip() + if idx >= 1: + if not args.no_simplified: + sentence_set.add(cc.convert(sent)) + else: + sentence_set.add(sent) + else: + sentence_set.add(sent) + batch = [] + for sent in tqdm(sentence_set): + count += 1 + if sent: + batch.append(sent) + if count % args.batch_size == 0: + results = tokenizer(batch) + for s, r in zip(batch, results): + sentence_to_tokenized[s] = r # Get tokenization map. + batch = [] + if batch: + results = tokenizer(batch) + for s, r in zip(batch, results): + sentence_to_tokenized[s] = r # Get tokenization map. + + timeout_indices = [] + + # 单进程模式 + for idx, line in enumerate(tqdm(lines)): + try: + ret = annotate_with_time_out(line) + except Exception: + timeout_indices.append(idx) + return timeout_indices + + + +def main(args): + timeout_indices = firsttime_process(args) + tokenizer = Tokenizer(args.granularity, args.device, args.segmented, args.bpe) + global annotator, sentence_to_tokenized + annotator = Annotator.create_default(args.granularity, args.multi_cheapest_strategy) + lines = open(args.file, "r", encoding="utf-8").read().strip().split("\n") + new_lines = []# format: id src tgt1 tgt2... + + with open(args.output, "w", encoding="utf-8") as f: + count = 0 + sentence_set = set() + sentence_to_tokenized = {} + for line_idx, line in enumerate(lines): + + if line_idx in timeout_indices: + # print(f"line before split: {line}") + line_split = line.split("\t") + line_number, sent_list = line_split[0], line_split[1:] + assert len(sent_list) == 2 + sent_list[-1] = " 无" + line = line_number + "\t" + "\t".join(sent_list) + # print(f"line time out: {line}") + new_lines.append(line) + else: + new_lines.append(line) + + sent_list = line.split("\t")[1:] + for idx, sent in enumerate(sent_list): + if args.segmented: + # print(sent) + sent = sent.strip() + else: + sent = "".join(sent.split()).strip() + if idx >= 1: + if not args.no_simplified: + sentence_set.add(cc.convert(sent)) + else: + sentence_set.add(sent) + else: + sentence_set.add(sent) + batch = [] + for sent in tqdm(sentence_set): + count += 1 + if sent: + batch.append(sent) + if count % args.batch_size == 0: + results = tokenizer(batch) + for s, r in zip(batch, results): + sentence_to_tokenized[s] = r # Get tokenization map. + batch = [] + if batch: + results = tokenizer(batch) + for s, r in zip(batch, results): + sentence_to_tokenized[s] = r # Get tokenization map. 
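# The two-pass timeout pattern above, reduced to a runnable toy: the first pass records
# which inputs hang (timeout_decorator is signal-based by default, so this works on a
# Unix main thread), and the caller can later substitute a placeholder for those lines.
import time
import timeout_decorator

@timeout_decorator.timeout(2)
def toy_annotate(line):
    if line == "hangs":
        time.sleep(5)                 # stand-in for an alignment that never finishes
    return line.upper()

timeout_indices = []
for idx, line in enumerate(["ok", "hangs", "also ok"]):
    try:
        toy_annotate(line)
    except Exception:                 # the decorator raises on timeout
        timeout_indices.append(idx)
print(timeout_indices)                # [1]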
+ + # 单进程模式 + lines = new_lines + for idx, line in enumerate(tqdm(lines)): + ret = annotate(line) + f.write(ret) + f.write("\n") + + # 多进程模式:仅在Linux环境下测试,建议在linux服务器上使用 + # with Pool(args.worker_num) as pool: + # for ret in pool.imap(annotate, tqdm(lines), chunksize=8): + # if ret: + # f.write(ret) + # f.write("\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Choose input file to annotate") + parser.add_argument("-f", "--file", type=str, required=True, help="Input parallel file") + parser.add_argument("-o", "--output", type=str, help="Output file", required=True) + parser.add_argument("-b", "--batch_size", type=int, help="The size of batch", default=128) + parser.add_argument("-d", "--device", type=int, help="The ID of GPU", default=0) + parser.add_argument("-w", "--worker_num", type=int, help="The number of workers", default=16) + parser.add_argument("-g", "--granularity", type=str, help="Choose char-level or word-level evaluation", default="char") + parser.add_argument("-m", "--merge", help="Whether merge continuous replacement/deletion/insertion", action="store_true") + parser.add_argument("-s", "--multi_cheapest_strategy", type=str, choices=["first", "all"], default="all") + parser.add_argument("--segmented", help="Whether tokens have been segmented", action="store_true") # 支持提前token化,用空格隔开 + parser.add_argument("--no_simplified", help="Whether simplifying chinese", action="store_true") # 将所有corrections转换为简体中文 + parser.add_argument("--bpe", help="Whether to use bpe", action="store_true") # 支持 bpe 切分英文单词 + args = parser.parse_args() + main(args) diff --git a/opencompass/datasets/lawbench/utils/rc_f1.py b/opencompass/datasets/lawbench/utils/rc_f1.py index ec403513..c8fc77c3 100644 --- a/opencompass/datasets/lawbench/utils/rc_f1.py +++ b/opencompass/datasets/lawbench/utils/rc_f1.py @@ -1,158 +1,158 @@ -"""Official evaluation script for CAIL-2021. - -The code is based partially on CoQA evaluation script. 
-""" -import json -import sys - -from collections import Counter - - -class CJRCEvaluator: - def __init__(self, gold_file): - self.gold_data = CJRCEvaluator.gold_answers_to_dict(gold_file) - - @staticmethod - def gold_answers_to_dict(gold_file): - dataset = json.load(open(gold_file, mode="r", encoding="utf-8")) - gold_dict = {} - # id_to_domain = {} - for story in dataset['data']: - qas = story["paragraphs"][0]["qas"] - for qa in qas: - qid = qa['id'] - gold_answers = [] - answers = qa["answers"] - if len(answers) == 0: - gold_answers = [''] - else: - for answer in qa["answers"]: - if type(answer) == dict: - gold_answers.append(answer["text"]) - elif type(answer) == list: - gold_answers.append("".join([a["text"] for a in answer])) - if qid in gold_dict: - sys.stderr.write("Gold file has duplicate stories: {}".format(qid)) - gold_dict[qid] = gold_answers - return gold_dict - - @staticmethod - def preds_to_dict(pred_file): - preds = json.load(open(pred_file, mode="r", encoding="utf-8")) - pred_dict = {} - for pred in preds: - pred_dict[pred['id']] = "".join(pred['answer']) - return pred_dict - - @staticmethod - def normalize_answer(s): - """Lower text and remove punctuation, storys and extra whitespace.""" - - def remove_punc(text): - return "".join(ch for ch in text if ch.isdigit() or ch.isalpha()) - - def lower(text): - return text.lower() - - return remove_punc(lower(s)) - - @staticmethod - def get_tokens(s): - if not s: return [] - return list(CJRCEvaluator.normalize_answer(s)) - - @staticmethod - def compute_exact(a_gold, a_pred): - return int(CJRCEvaluator.normalize_answer(a_gold) == CJRCEvaluator.normalize_answer(a_pred)) - - @staticmethod - def compute_f1(a_gold, a_pred): - gold_toks = CJRCEvaluator.get_tokens(a_gold) - pred_toks = CJRCEvaluator.get_tokens(a_pred) - common = Counter(gold_toks) & Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - @staticmethod - def _compute_turn_score(a_gold_list, a_pred): - f1_sum = 0.0 - em_sum = 0.0 - if len(a_gold_list) > 1: - for i in range(len(a_gold_list)): - # exclude the current answer - gold_answers = a_gold_list[0:i] + a_gold_list[i + 1:] - em_sum += max(CJRCEvaluator.compute_exact(a, a_pred) for a in gold_answers) - f1_sum += max(CJRCEvaluator.compute_f1(a, a_pred) for a in gold_answers) - else: - em_sum += max(CJRCEvaluator.compute_exact(a, a_pred) for a in a_gold_list) - f1_sum += max(CJRCEvaluator.compute_f1(a, a_pred) for a in a_gold_list) - if f1_sum != 1: - a = 1 + 1 - return {'em': em_sum / max(1, len(a_gold_list)), 'f1': f1_sum / max(1, len(a_gold_list))} - - def compute_turn_score(self, qid, a_pred): - ''' This is the function what you are probably looking for. a_pred is the answer string your model predicted. 
''' - a_gold_list = self.gold_data[qid] - return CJRCEvaluator._compute_turn_score(a_gold_list, a_pred) - - def get_raw_scores(self, pred_data): - ''''Returns a dict with score''' - exact_scores = {} - f1_scores = {} - for qid in self.gold_data: - if qid not in pred_data: - sys.stderr.write('Missing prediction for {}\n'.format(qid)) - continue - a_pred = pred_data[qid] - scores = self.compute_turn_score(qid, a_pred) - # Take max over all gold answers - exact_scores[qid] = scores['em'] - f1_scores[qid] = scores['f1'] - return exact_scores, f1_scores - - def get_raw_scores_human(self): - ''' - Returns a dict with score - ''' - exact_scores = {} - f1_scores = {} - for qid in self.gold_data: - f1_sum = 0.0 - em_sum = 0.0 - if len(self.gold_data[qid]) > 1: - for i in range(len(self.gold_data[qid])): - # exclude the current answer - gold_answers = self.gold_data[qid][0:i] + self.gold_data[qid][i + 1:] - em_sum += max(CJRCEvaluator.compute_exact(a, self.gold_data[qid][i]) for a in gold_answers) - f1_sum += max(CJRCEvaluator.compute_f1(a, self.gold_data[qid][i]) for a in gold_answers) - else: - exit("Gold answers should be multiple: {}={}".format(qid, self.gold_data[qid])) - exact_scores[qid] = em_sum / len(self.gold_data[qid]) - f1_scores[qid] = f1_sum / len(self.gold_data[qid]) - return exact_scores, f1_scores - - def human_performance(self): - exact_scores, f1_scores = self.get_raw_scores_human() - return self.get_total_scores(exact_scores, f1_scores) - - def model_performance(self, pred_data): - exact_scores, f1_scores = self.get_raw_scores(pred_data) - return self.get_total_scores(exact_scores, f1_scores) - - def get_total_scores(self, exact_scores, f1_scores): - em_total, f1_total, turn_count = 0, 0, 0 - scores = {} - for qid in self.gold_data: - em_total += exact_scores.get(qid, 0) - f1_total += f1_scores.get(qid, 0) - turn_count += 1 - scores["F1"] = round(f1_total / max(1, turn_count) * 100, 1) - return scores +"""Official evaluation script for CAIL-2021. + +The code is based partially on CoQA evaluation script. 
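+Answers are lowercased and stripped of punctuation, then scored with
+character-level exact match and F1.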
+""" +import json +import sys + +from collections import Counter + + +class CJRCEvaluator: + def __init__(self, gold_file): + self.gold_data = CJRCEvaluator.gold_answers_to_dict(gold_file) + + @staticmethod + def gold_answers_to_dict(gold_file): + dataset = json.load(open(gold_file, mode="r", encoding="utf-8")) + gold_dict = {} + # id_to_domain = {} + for story in dataset['data']: + qas = story["paragraphs"][0]["qas"] + for qa in qas: + qid = qa['id'] + gold_answers = [] + answers = qa["answers"] + if len(answers) == 0: + gold_answers = [''] + else: + for answer in qa["answers"]: + if type(answer) == dict: + gold_answers.append(answer["text"]) + elif type(answer) == list: + gold_answers.append("".join([a["text"] for a in answer])) + if qid in gold_dict: + sys.stderr.write("Gold file has duplicate stories: {}".format(qid)) + gold_dict[qid] = gold_answers + return gold_dict + + @staticmethod + def preds_to_dict(pred_file): + preds = json.load(open(pred_file, mode="r", encoding="utf-8")) + pred_dict = {} + for pred in preds: + pred_dict[pred['id']] = "".join(pred['answer']) + return pred_dict + + @staticmethod + def normalize_answer(s): + """Lower text and remove punctuation, storys and extra whitespace.""" + + def remove_punc(text): + return "".join(ch for ch in text if ch.isdigit() or ch.isalpha()) + + def lower(text): + return text.lower() + + return remove_punc(lower(s)) + + @staticmethod + def get_tokens(s): + if not s: return [] + return list(CJRCEvaluator.normalize_answer(s)) + + @staticmethod + def compute_exact(a_gold, a_pred): + return int(CJRCEvaluator.normalize_answer(a_gold) == CJRCEvaluator.normalize_answer(a_pred)) + + @staticmethod + def compute_f1(a_gold, a_pred): + gold_toks = CJRCEvaluator.get_tokens(a_gold) + pred_toks = CJRCEvaluator.get_tokens(a_pred) + common = Counter(gold_toks) & Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + @staticmethod + def _compute_turn_score(a_gold_list, a_pred): + f1_sum = 0.0 + em_sum = 0.0 + if len(a_gold_list) > 1: + for i in range(len(a_gold_list)): + # exclude the current answer + gold_answers = a_gold_list[0:i] + a_gold_list[i + 1:] + em_sum += max(CJRCEvaluator.compute_exact(a, a_pred) for a in gold_answers) + f1_sum += max(CJRCEvaluator.compute_f1(a, a_pred) for a in gold_answers) + else: + em_sum += max(CJRCEvaluator.compute_exact(a, a_pred) for a in a_gold_list) + f1_sum += max(CJRCEvaluator.compute_f1(a, a_pred) for a in a_gold_list) + if f1_sum != 1: + a = 1 + 1 + return {'em': em_sum / max(1, len(a_gold_list)), 'f1': f1_sum / max(1, len(a_gold_list))} + + def compute_turn_score(self, qid, a_pred): + ''' This is the function what you are probably looking for. a_pred is the answer string your model predicted. 
''' + a_gold_list = self.gold_data[qid] + return CJRCEvaluator._compute_turn_score(a_gold_list, a_pred) + + def get_raw_scores(self, pred_data): + ''''Returns a dict with score''' + exact_scores = {} + f1_scores = {} + for qid in self.gold_data: + if qid not in pred_data: + sys.stderr.write('Missing prediction for {}\n'.format(qid)) + continue + a_pred = pred_data[qid] + scores = self.compute_turn_score(qid, a_pred) + # Take max over all gold answers + exact_scores[qid] = scores['em'] + f1_scores[qid] = scores['f1'] + return exact_scores, f1_scores + + def get_raw_scores_human(self): + ''' + Returns a dict with score + ''' + exact_scores = {} + f1_scores = {} + for qid in self.gold_data: + f1_sum = 0.0 + em_sum = 0.0 + if len(self.gold_data[qid]) > 1: + for i in range(len(self.gold_data[qid])): + # exclude the current answer + gold_answers = self.gold_data[qid][0:i] + self.gold_data[qid][i + 1:] + em_sum += max(CJRCEvaluator.compute_exact(a, self.gold_data[qid][i]) for a in gold_answers) + f1_sum += max(CJRCEvaluator.compute_f1(a, self.gold_data[qid][i]) for a in gold_answers) + else: + exit("Gold answers should be multiple: {}={}".format(qid, self.gold_data[qid])) + exact_scores[qid] = em_sum / len(self.gold_data[qid]) + f1_scores[qid] = f1_sum / len(self.gold_data[qid]) + return exact_scores, f1_scores + + def human_performance(self): + exact_scores, f1_scores = self.get_raw_scores_human() + return self.get_total_scores(exact_scores, f1_scores) + + def model_performance(self, pred_data): + exact_scores, f1_scores = self.get_raw_scores(pred_data) + return self.get_total_scores(exact_scores, f1_scores) + + def get_total_scores(self, exact_scores, f1_scores): + em_total, f1_total, turn_count = 0, 0, 0 + scores = {} + for qid in self.gold_data: + em_total += exact_scores.get(qid, 0) + f1_total += f1_scores.get(qid, 0) + turn_count += 1 + scores["F1"] = round(f1_total / max(1, turn_count) * 100, 1) + return scores diff --git a/opencompass/datasets/maritime_bench.py b/opencompass/datasets/maritime_bench.py new file mode 100644 index 00000000..089f8c27 --- /dev/null +++ b/opencompass/datasets/maritime_bench.py @@ -0,0 +1,64 @@ +import json +import os.path as osp +from os import environ + +import datasets +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MaritimeBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str) -> datasets.Dataset: + path = get_data_path(path) + dataset = DatasetDict() + dataset_list = [] + + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + for split in ['test']: + # 从 ModelScope 加载数据 + ms_dataset = MsDataset.load(path, + subset_name=name, + split=split) + + for line in ms_dataset: + question = line['question'] + A = line['A'] + B = line['B'] + C = line['C'] + D = line['D'] + answer = line['answer'] + dataset_list.append({ + 'question': question, + 'A': A, + 'B': B, + 'C': C, + 'D': D, + 'answer': answer, + }) + # dataset[split] = Dataset.from_list(dataset_list) + else: + for split in ['test']: + filename = osp.join(path, split, f'{name}_{split}.jsonl') + with open(filename, encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset_list.append({ + 'question': data['question'], + 'A': data['A'], + 'B': data['B'], + 'C': data['C'], + 'D': data['D'], + 'answer': data['answer'] + }) + + dataset[split] = Dataset.from_list(dataset_list) + + 
return dataset diff --git a/opencompass/datasets/medbench/__init__.py b/opencompass/datasets/medbench/__init__.py index 5d0d8ccd..eac60625 100644 --- a/opencompass/datasets/medbench/__init__.py +++ b/opencompass/datasets/medbench/__init__.py @@ -1,3 +1,3 @@ -# flake8: noqa - -from .medbench import * # noqa: F401, F403 +# flake8: noqa + +from .medbench import * # noqa: F401, F403 diff --git a/opencompass/datasets/medbench/constructions.py b/opencompass/datasets/medbench/constructions.py index c3302173..323e47ff 100644 --- a/opencompass/datasets/medbench/constructions.py +++ b/opencompass/datasets/medbench/constructions.py @@ -1,104 +1,104 @@ -# flake8: noqa -import pandas as pd - - -class TaskSchema(object): - - def __init__(self, - passage=None, - question=None, - options=None, - label=None, - answer=None, - other=None): - self.passage = passage - self.question = question - self.options = options - self.label = label - self.answer = answer - self.other = other - - def to_dict(self): - return { - 'passage': self.passage, - 'question': self.question, - 'options': self.options, - 'label': self.label, - 'answer': self.answer, - 'other': self.other - } - - -# define README.json -class MedBenchInstance(object): - - def __init__(self, task_description, data_source, task_schema, output, - evaluation_metric, task_example): - self.task_description = task_description - self.data_source = data_source - self.task_schema = task_schema - self.output = output - self.evaluation_metric = evaluation_metric - self.task_example = task_example - - def to_dict(self): - return { - 'task description': self.task_description, - 'data source': self.data_source, - 'task schema': self.task_schema.to_dict(), - 'output': self.output, - 'evaluation metric': self.evaluation_metric, - 'task example': self.task_example - } - - -class ChatGPTSchema(object): - - def __init__(self, context=None, metadata=''): - self.context = context - self.metadata = metadata - - def to_dict(self): - return {'context': self.context, 'metadata': self.metadata} - - -class ResultsForHumanSchema(object): - - def __init__(self, - index, - problem_input, - label, - model_input='', - model_output='', - parse_result='', - first_stage_output='', - second_stage_input='', - is_correct=False): - self.index = index - self.problem_input = problem_input - self.model_input = model_input - self.model_output = model_output - self.parse_result = parse_result - self.label = label - self.first_stage_output = first_stage_output - self.second_stage_input = second_stage_input - self.is_correct = is_correct - - def to_dict(self): - return { - 'index': self.index, - 'problem_input': self.problem_input, - 'model_input': self.model_input, - 'model_output': self.model_output, - 'parse_result': self.parse_result, - 'label': self.label, - 'is_correct': self.is_correct, - 'first_stage_output': self.first_stage_output, - 'second_stage_input': self.second_stage_input, - } - - @staticmethod - def to_tsv(result_list, path): - result_json = [item.to_dict() for item in result_list] - table = pd.json_normalize(result_json) - table.to_excel(path, index=False) +# flake8: noqa +import pandas as pd + + +class TaskSchema(object): + + def __init__(self, + passage=None, + question=None, + options=None, + label=None, + answer=None, + other=None): + self.passage = passage + self.question = question + self.options = options + self.label = label + self.answer = answer + self.other = other + + def to_dict(self): + return { + 'passage': self.passage, + 'question': self.question, + 'options': 
self.options, + 'label': self.label, + 'answer': self.answer, + 'other': self.other + } + + +# define README.json +class MedBenchInstance(object): + + def __init__(self, task_description, data_source, task_schema, output, + evaluation_metric, task_example): + self.task_description = task_description + self.data_source = data_source + self.task_schema = task_schema + self.output = output + self.evaluation_metric = evaluation_metric + self.task_example = task_example + + def to_dict(self): + return { + 'task description': self.task_description, + 'data source': self.data_source, + 'task schema': self.task_schema.to_dict(), + 'output': self.output, + 'evaluation metric': self.evaluation_metric, + 'task example': self.task_example + } + + +class ChatGPTSchema(object): + + def __init__(self, context=None, metadata=''): + self.context = context + self.metadata = metadata + + def to_dict(self): + return {'context': self.context, 'metadata': self.metadata} + + +class ResultsForHumanSchema(object): + + def __init__(self, + index, + problem_input, + label, + model_input='', + model_output='', + parse_result='', + first_stage_output='', + second_stage_input='', + is_correct=False): + self.index = index + self.problem_input = problem_input + self.model_input = model_input + self.model_output = model_output + self.parse_result = parse_result + self.label = label + self.first_stage_output = first_stage_output + self.second_stage_input = second_stage_input + self.is_correct = is_correct + + def to_dict(self): + return { + 'index': self.index, + 'problem_input': self.problem_input, + 'model_input': self.model_input, + 'model_output': self.model_output, + 'parse_result': self.parse_result, + 'label': self.label, + 'is_correct': self.is_correct, + 'first_stage_output': self.first_stage_output, + 'second_stage_input': self.second_stage_input, + } + + @staticmethod + def to_tsv(result_list, path): + result_json = [item.to_dict() for item in result_list] + table = pd.json_normalize(result_json) + table.to_excel(path, index=False) diff --git a/opencompass/datasets/medbench/dataset_loader.py b/opencompass/datasets/medbench/dataset_loader.py index fb704214..8fc617fe 100644 --- a/opencompass/datasets/medbench/dataset_loader.py +++ b/opencompass/datasets/medbench/dataset_loader.py @@ -1,338 +1,338 @@ -# flake8: noqa -import ast -import json -import os - -import pandas as pd -import tiktoken -from tqdm import tqdm - -from .constructions import ChatGPTSchema, ResultsForHumanSchema -from .utils import extract_answer, read_jsonl, save_jsonl - -# define the datasets -medbench_multiple_choices_sets = ['Med-Exam', 'DDx-basic', 'DDx-advanced', 'DDx-advanced', 'SafetyBench'] # 选择题,用acc判断 - -medbench_qa_sets = ['MedHC', 'MedMC', 'MedDG', 'MedSpeQA', 'MedTreat', 'CMB-Clin'] # 开放式QA,有标答 - -medbench_cloze_sets = ['MedHG'] # 限定域QA,有标答 - -medbench_single_choice_sets = ['DrugCA'] # 正确与否判断,有标答 - -medbench_ie_sets = ['DBMHG', 'CMeEE', 'CMeIE', 'CHIP-CDEE', 'CHIP-CDN', 'CHIP-CTC', 'SMDoc', 'IMCS-V2-MRG'] # 判断识别的实体是否一致,用F1评价 - -def convert_zero_shot(line, dataset_name): - # passage = line['passage'] if line['passage'] is not None else '' - # if dataset_name in medbench_qa_sets: - # return line['question'] - # elif dataset_name in medbench_cloze_sets: - # return '问题:' + line['question'] + '\n答案:' - # elif dataset_name in medbench_multiple_choices_sets: - # return '问题:' + line['question'] + ' ' \ - # + '选项:' + ' '.join(line['options']) + '\n从A到G,我们应该选择' - # else: - # return line['question'] - return line['question'] - -prefix = 
'该问题为单选题,所有选项中必有一个正确答案,且只有一个正确答案。\n' - -# def convert_zero_shot_CoT_stage1(line, dataset_name): -# try: -# passage = line['passage'] if line['passage'] is not None else '' -# if dataset_name in english_qa_datasets: -# return passage + 'Q: ' + line['question'] + ' ' \ -# + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \ -# "Let's think step by step." - -# elif dataset_name in chinese_qa_datasets: -# option_string = 'ABCDEFG' -# count = len(line['options']) -# if count == 1: -# count = 4 -# return passage + '问题:' + line['question'] + ' ' \ -# + '选项:' + ' '.join(line['options']) + '\n' + \ -# '从A到{}, 我们应选择什么?让我们逐步思考:'.format(option_string[count - 1]) - -# elif dataset_name in english_cloze_datasets: -# return passage + 'Q: ' + line['question'] + '\n' \ -# "A: Let's think step by step." - -# elif dataset_name in chinese_cloze_datasets: -# return passage + '问题:' + line['question'] + '\n' \ -# '答案:让我们逐步思考:' -# except NameError: -# print('Dataset not defined.') - - -# process few-shot raw_prompts -def combine_prompt(prompt_path, - dataset_name, - load_explanation=True, - chat_mode=False): - skip_passage = False - if dataset_name == 'sat-en-without-passage': - skip_passage = True - dataset_name = 'sat-en' - demostrations = [] - # read the prompts by context and explanation - context_row = [0, 1, 3, 5, 7, 9] - explanation_row = [0, 2, 4, 6, 8, 10] - raw_prompts_context = pd.read_csv(prompt_path, - header=0, - skiprows=lambda x: x not in context_row, - keep_default_na=False) - raw_prompts_explanation = pd.read_csv( - prompt_path, - header=0, - skiprows=lambda x: x not in explanation_row, - keep_default_na=False).replace(r'\n\n', '\n', regex=True) - contexts = [] - for line in list(raw_prompts_context[dataset_name]): - if line: - # print(line) - contexts.append(ast.literal_eval(line)) - explanations = [ - exp for exp in raw_prompts_explanation[dataset_name] if exp - ] - - for idx, (con, exp) in enumerate(zip(contexts, explanations)): - passage = con['passage'] if con[ - 'passage'] is not None and not skip_passage else '' - question = con['question'] - options = con['options'] if con['options'] is not None else '' - label = con['label'] if con['label'] is not None else '' - answer = con[ - 'answer'] if 'answer' in con and con['answer'] is not None else '' - - if dataset_name in qa_datasets: - question_input = '问题 {}. '.format(idx + 1) + passage + ' ' + question + '\n' \ - + '从以下选项中选择: ' + ' '.join(options) + '\n' - question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \ - + '答案是 {}'.format(label) - - elif dataset_name in cloze_datasets: - question_input = '问题 {}. 
'.format(idx + 1) + question + '\n' - question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \ - + '答案是 {}'.format(answer) - else: - raise ValueError( - f'During loading few-sot examples, found unknown dataset: {dataset_name}' - ) - if chat_mode: - demostrations.append((question_input, question_output)) - else: - demostrations.append(question_input + question_output + '\n') - - return demostrations - - -enc = None - - -def _lazy_load_enc(): - global enc - if enc is None: - enc = tiktoken.encoding_for_model('gpt-4') - - -# cut prompt if reach max token length -def concat_prompt(demos, - dataset_name, - max_tokens, - end_of_example='\n', - verbose=False): - _lazy_load_enc() - demostration_en = 'Here are the answers for the problems in the exam.\n' - demostration_zh = '以下是考试中各个问题的答案。\n' - - for i in range(len(demos)): - # print(len(enc.encode(demostration_en)), len(enc.encode(demostration_zh))) - if dataset_name in english_qa_datasets: - demostration_en = demostration_en + demos[i] + end_of_example - elif dataset_name in chinese_qa_datasets: - demostration_zh = demostration_zh + demos[i] + end_of_example - elif dataset_name in english_cloze_datasets: - demostration_en = demostration_en + demos[i] + end_of_example - elif dataset_name in chinese_cloze_datasets: - demostration_zh = demostration_zh + demos[i] + end_of_example - # break if reach max token limit - if len(enc.encode(demostration_en)) < max_tokens and len( - enc.encode(demostration_zh)) < max_tokens: - output = demostration_en if len(demostration_en) > len( - demostration_zh) else demostration_zh - prompt_num = i + 1 - else: - break - if verbose: - print('max_tokens set as ', max_tokens, 'actual_tokens is', - len(enc.encode(output)), 'num_shot is', prompt_num) - return output, prompt_num - - -def concat_prompt_chat_mode(demos, - dataset_name, - max_tokens, - end_of_example='\n', - verbose=False): - _lazy_load_enc() - answers = [] - sentences = '' - for i in range(len(demos)): - answers += [ - { - 'role': 'user', - 'content': demos[i][0] - }, - { - 'role': 'assistant', - 'content': demos[i][1] - }, - ] - sentences += json.dumps(answers[-1]) - # break if reach max token limit - if len(enc.encode(sentences)) > max_tokens: - answers.pop() - answers.pop() - break - if verbose: - print('max_tokens set as ', max_tokens, 'actual_tokens is', - len(enc.encode(sentences)), 'num_shot is', - len(answers) // 2) - return answers, len(answers) // 2 - - -def convert_few_shot(line, dataset_name, demo, n_shot, chat_mode=False): - passage = line['passage'] if line['passage'] is not None else '' - question = line['question'] - options = line['options'] if line['options'] is not None else '' - - if dataset_name in qa_datasets: - question_input = '问题 {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \ - + '从以下选项中选择: ' + ' '.join(options) + '\n' - # + "问题 {}的解析: ".format(n_shot + 1) - - if dataset_name in cloze_datasets: - question_input = '问题 {}. 
'.format(n_shot + 1) + question + '\n' - # + "问题 {}的解析: ".format(n_shot + 1) - if chat_mode: - return demo + [ - { - 'role': 'user', - 'content': question_input - }, - ] - else: - return demo + question_input - - -def load_dataset(dataset_name, - setting_name, - parent_path, - prompt_path=None, - max_tokens=None, - end_of_example='\n', - chat_mode=False, - verbose=False): - test_path = os.path.join(parent_path, dataset_name + '.jsonl') - loaded_jsonl = read_jsonl(test_path) - processed = [] - if setting_name == 'few-shot-CoT' or setting_name == 'few-shot': - # process demo once if it is few-shot-CoT - processed_demos = combine_prompt( - prompt_path, - dataset_name, - load_explanation=setting_name == 'few-shot-CoT', - chat_mode=chat_mode) - if chat_mode: - chosen_prompt, n_shot = concat_prompt_chat_mode(processed_demos, - dataset_name, - max_tokens, - end_of_example, - verbose=verbose) - else: - chosen_prompt, n_shot = concat_prompt(processed_demos, - dataset_name, - max_tokens, - end_of_example, - verbose=verbose) - - if verbose: - loaded_jsonl = tqdm(loaded_jsonl) - for meta_idx, line in enumerate(loaded_jsonl): - # 正确 - if setting_name == 'zero-shot': - ctxt = convert_zero_shot(line, dataset_name) - elif setting_name == 'zero-shot-CoT': - ctxt = convert_zero_shot_CoT_stage1(line, dataset_name) - elif setting_name == 'few-shot-CoT' or setting_name == 'few-shot': - ctxt = convert_few_shot(line, dataset_name, chosen_prompt, n_shot, - chat_mode) - try: - new_instance = ChatGPTSchema(context=ctxt, metadata=meta_idx) - processed.append(new_instance.to_dict()) - except NameError: - print('Dataset not defined.') - return processed - - -def generate_second_stage_input(dataset_name, - input_list, - output_list, - with_format_prompt=False): - try: - chinese_format_prompt = '根据以上内容,你的任务是把最终的答案提取出来并填在【】中,例如【0】或者【A】。' - if dataset_name in qa_datasets: - prompt_suffix = '因此,从A到D, 我们应选择' - if with_format_prompt: - prompt_suffix = chinese_format_prompt + prompt_suffix - elif dataset_name in cloze_datasets: - prompt_suffix = '因此,答案是' - if with_format_prompt: - prompt_suffix = chinese_format_prompt + prompt_suffix - except NameError: - print('Dataset not defined.') - processed = [] - for i in range(len(input_list)): - ctxt = '{0}\n{1}\n{2}'.format(input_list[i]['context'], - extract_answer(output_list[i]), - prompt_suffix) - new_instance = ChatGPTSchema(context=ctxt, - metadata=input_list[i]['metadata']) - processed.append(new_instance.to_dict()) - return processed - - -def load_dataset_as_result_schema(dataset_name, parent_path): - test_path = os.path.join(parent_path, dataset_name + '.jsonl') - loaded_jsonl = read_jsonl(test_path) - - processed = [] - for i, line in enumerate(loaded_jsonl): - problem_input = convert_zero_shot(line, dataset_name) - processed.append( - ResultsForHumanSchema( - index=i, - problem_input=problem_input, - # label=line['label'] if line['label'] else line['answer'] - label = line['answer'] - )) - return processed - - -if __name__ == '__main__': - # set variables - parent_dir = '../../data/exam_guidance' - - # set dataset name to process - setting_name = 'zero-shot' # setting_name can be chosen from ["zero-shot", "zero-shot-CoT", "few-shot-CoT"] - data_name = 'health_exam' - save_dir = '../../experiment_input/{}/'.format(setting_name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - processed_data = load_dataset(data_name, - setting_name, - parent_dir, - prompt_path=raw_prompt_path, - max_tokens=2048) - save_jsonl(processed_data, - os.path.join(save_dir, 
'{}.jsonl'.format(data_name))) +# flake8: noqa +import ast +import json +import os + +import pandas as pd +import tiktoken +from tqdm import tqdm + +from .constructions import ChatGPTSchema, ResultsForHumanSchema +from .utils import extract_answer, read_jsonl, save_jsonl + +# define the datasets +medbench_multiple_choices_sets = ['Med-Exam', 'DDx-basic', 'DDx-advanced', 'DDx-advanced', 'SafetyBench'] # 选择题,用acc判断 + +medbench_qa_sets = ['MedHC', 'MedMC', 'MedDG', 'MedSpeQA', 'MedTreat', 'CMB-Clin'] # 开放式QA,有标答 + +medbench_cloze_sets = ['MedHG'] # 限定域QA,有标答 + +medbench_single_choice_sets = ['DrugCA'] # 正确与否判断,有标答 + +medbench_ie_sets = ['DBMHG', 'CMeEE', 'CMeIE', 'CHIP-CDEE', 'CHIP-CDN', 'CHIP-CTC', 'SMDoc', 'IMCS-V2-MRG'] # 判断识别的实体是否一致,用F1评价 + +def convert_zero_shot(line, dataset_name): + # passage = line['passage'] if line['passage'] is not None else '' + # if dataset_name in medbench_qa_sets: + # return line['question'] + # elif dataset_name in medbench_cloze_sets: + # return '问题:' + line['question'] + '\n答案:' + # elif dataset_name in medbench_multiple_choices_sets: + # return '问题:' + line['question'] + ' ' \ + # + '选项:' + ' '.join(line['options']) + '\n从A到G,我们应该选择' + # else: + # return line['question'] + return line['question'] + +prefix = '该问题为单选题,所有选项中必有一个正确答案,且只有一个正确答案。\n' + +# def convert_zero_shot_CoT_stage1(line, dataset_name): +# try: +# passage = line['passage'] if line['passage'] is not None else '' +# if dataset_name in english_qa_datasets: +# return passage + 'Q: ' + line['question'] + ' ' \ +# + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \ +# "Let's think step by step." + +# elif dataset_name in chinese_qa_datasets: +# option_string = 'ABCDEFG' +# count = len(line['options']) +# if count == 1: +# count = 4 +# return passage + '问题:' + line['question'] + ' ' \ +# + '选项:' + ' '.join(line['options']) + '\n' + \ +# '从A到{}, 我们应选择什么?让我们逐步思考:'.format(option_string[count - 1]) + +# elif dataset_name in english_cloze_datasets: +# return passage + 'Q: ' + line['question'] + '\n' \ +# "A: Let's think step by step." 
+ +# elif dataset_name in chinese_cloze_datasets: +# return passage + '问题:' + line['question'] + '\n' \ +# '答案:让我们逐步思考:' +# except NameError: +# print('Dataset not defined.') + + +# process few-shot raw_prompts +def combine_prompt(prompt_path, + dataset_name, + load_explanation=True, + chat_mode=False): + skip_passage = False + if dataset_name == 'sat-en-without-passage': + skip_passage = True + dataset_name = 'sat-en' + demostrations = [] + # read the prompts by context and explanation + context_row = [0, 1, 3, 5, 7, 9] + explanation_row = [0, 2, 4, 6, 8, 10] + raw_prompts_context = pd.read_csv(prompt_path, + header=0, + skiprows=lambda x: x not in context_row, + keep_default_na=False) + raw_prompts_explanation = pd.read_csv( + prompt_path, + header=0, + skiprows=lambda x: x not in explanation_row, + keep_default_na=False).replace(r'\n\n', '\n', regex=True) + contexts = [] + for line in list(raw_prompts_context[dataset_name]): + if line: + # print(line) + contexts.append(ast.literal_eval(line)) + explanations = [ + exp for exp in raw_prompts_explanation[dataset_name] if exp + ] + + for idx, (con, exp) in enumerate(zip(contexts, explanations)): + passage = con['passage'] if con[ + 'passage'] is not None and not skip_passage else '' + question = con['question'] + options = con['options'] if con['options'] is not None else '' + label = con['label'] if con['label'] is not None else '' + answer = con[ + 'answer'] if 'answer' in con and con['answer'] is not None else '' + + if dataset_name in qa_datasets: + question_input = '问题 {}. '.format(idx + 1) + passage + ' ' + question + '\n' \ + + '从以下选项中选择: ' + ' '.join(options) + '\n' + question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \ + + '答案是 {}'.format(label) + + elif dataset_name in cloze_datasets: + question_input = '问题 {}. 
'.format(idx + 1) + question + '\n' + question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \ + + '答案是 {}'.format(answer) + else: + raise ValueError( + f'During loading few-sot examples, found unknown dataset: {dataset_name}' + ) + if chat_mode: + demostrations.append((question_input, question_output)) + else: + demostrations.append(question_input + question_output + '\n') + + return demostrations + + +enc = None + + +def _lazy_load_enc(): + global enc + if enc is None: + enc = tiktoken.encoding_for_model('gpt-4') + + +# cut prompt if reach max token length +def concat_prompt(demos, + dataset_name, + max_tokens, + end_of_example='\n', + verbose=False): + _lazy_load_enc() + demostration_en = 'Here are the answers for the problems in the exam.\n' + demostration_zh = '以下是考试中各个问题的答案。\n' + + for i in range(len(demos)): + # print(len(enc.encode(demostration_en)), len(enc.encode(demostration_zh))) + if dataset_name in english_qa_datasets: + demostration_en = demostration_en + demos[i] + end_of_example + elif dataset_name in chinese_qa_datasets: + demostration_zh = demostration_zh + demos[i] + end_of_example + elif dataset_name in english_cloze_datasets: + demostration_en = demostration_en + demos[i] + end_of_example + elif dataset_name in chinese_cloze_datasets: + demostration_zh = demostration_zh + demos[i] + end_of_example + # break if reach max token limit + if len(enc.encode(demostration_en)) < max_tokens and len( + enc.encode(demostration_zh)) < max_tokens: + output = demostration_en if len(demostration_en) > len( + demostration_zh) else demostration_zh + prompt_num = i + 1 + else: + break + if verbose: + print('max_tokens set as ', max_tokens, 'actual_tokens is', + len(enc.encode(output)), 'num_shot is', prompt_num) + return output, prompt_num + + +def concat_prompt_chat_mode(demos, + dataset_name, + max_tokens, + end_of_example='\n', + verbose=False): + _lazy_load_enc() + answers = [] + sentences = '' + for i in range(len(demos)): + answers += [ + { + 'role': 'user', + 'content': demos[i][0] + }, + { + 'role': 'assistant', + 'content': demos[i][1] + }, + ] + sentences += json.dumps(answers[-1]) + # break if reach max token limit + if len(enc.encode(sentences)) > max_tokens: + answers.pop() + answers.pop() + break + if verbose: + print('max_tokens set as ', max_tokens, 'actual_tokens is', + len(enc.encode(sentences)), 'num_shot is', + len(answers) // 2) + return answers, len(answers) // 2 + + +def convert_few_shot(line, dataset_name, demo, n_shot, chat_mode=False): + passage = line['passage'] if line['passage'] is not None else '' + question = line['question'] + options = line['options'] if line['options'] is not None else '' + + if dataset_name in qa_datasets: + question_input = '问题 {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \ + + '从以下选项中选择: ' + ' '.join(options) + '\n' + # + "问题 {}的解析: ".format(n_shot + 1) + + if dataset_name in cloze_datasets: + question_input = '问题 {}. 
'.format(n_shot + 1) + question + '\n' + # + "问题 {}的解析: ".format(n_shot + 1) + if chat_mode: + return demo + [ + { + 'role': 'user', + 'content': question_input + }, + ] + else: + return demo + question_input + + +def load_dataset(dataset_name, + setting_name, + parent_path, + prompt_path=None, + max_tokens=None, + end_of_example='\n', + chat_mode=False, + verbose=False): + test_path = os.path.join(parent_path, dataset_name + '.jsonl') + loaded_jsonl = read_jsonl(test_path) + processed = [] + if setting_name == 'few-shot-CoT' or setting_name == 'few-shot': + # process demo once if it is few-shot-CoT + processed_demos = combine_prompt( + prompt_path, + dataset_name, + load_explanation=setting_name == 'few-shot-CoT', + chat_mode=chat_mode) + if chat_mode: + chosen_prompt, n_shot = concat_prompt_chat_mode(processed_demos, + dataset_name, + max_tokens, + end_of_example, + verbose=verbose) + else: + chosen_prompt, n_shot = concat_prompt(processed_demos, + dataset_name, + max_tokens, + end_of_example, + verbose=verbose) + + if verbose: + loaded_jsonl = tqdm(loaded_jsonl) + for meta_idx, line in enumerate(loaded_jsonl): + # 正确 + if setting_name == 'zero-shot': + ctxt = convert_zero_shot(line, dataset_name) + elif setting_name == 'zero-shot-CoT': + ctxt = convert_zero_shot_CoT_stage1(line, dataset_name) + elif setting_name == 'few-shot-CoT' or setting_name == 'few-shot': + ctxt = convert_few_shot(line, dataset_name, chosen_prompt, n_shot, + chat_mode) + try: + new_instance = ChatGPTSchema(context=ctxt, metadata=meta_idx) + processed.append(new_instance.to_dict()) + except NameError: + print('Dataset not defined.') + return processed + + +def generate_second_stage_input(dataset_name, + input_list, + output_list, + with_format_prompt=False): + try: + chinese_format_prompt = '根据以上内容,你的任务是把最终的答案提取出来并填在【】中,例如【0】或者【A】。' + if dataset_name in qa_datasets: + prompt_suffix = '因此,从A到D, 我们应选择' + if with_format_prompt: + prompt_suffix = chinese_format_prompt + prompt_suffix + elif dataset_name in cloze_datasets: + prompt_suffix = '因此,答案是' + if with_format_prompt: + prompt_suffix = chinese_format_prompt + prompt_suffix + except NameError: + print('Dataset not defined.') + processed = [] + for i in range(len(input_list)): + ctxt = '{0}\n{1}\n{2}'.format(input_list[i]['context'], + extract_answer(output_list[i]), + prompt_suffix) + new_instance = ChatGPTSchema(context=ctxt, + metadata=input_list[i]['metadata']) + processed.append(new_instance.to_dict()) + return processed + + +def load_dataset_as_result_schema(dataset_name, parent_path): + test_path = os.path.join(parent_path, dataset_name + '.jsonl') + loaded_jsonl = read_jsonl(test_path) + + processed = [] + for i, line in enumerate(loaded_jsonl): + problem_input = convert_zero_shot(line, dataset_name) + processed.append( + ResultsForHumanSchema( + index=i, + problem_input=problem_input, + # label=line['label'] if line['label'] else line['answer'] + label = line['answer'] + )) + return processed + + +if __name__ == '__main__': + # set variables + parent_dir = '../../data/exam_guidance' + + # set dataset name to process + setting_name = 'zero-shot' # setting_name can be chosen from ["zero-shot", "zero-shot-CoT", "few-shot-CoT"] + data_name = 'health_exam' + save_dir = '../../experiment_input/{}/'.format(setting_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + processed_data = load_dataset(data_name, + setting_name, + parent_dir, + prompt_path=raw_prompt_path, + max_tokens=2048) + save_jsonl(processed_data, + os.path.join(save_dir, 
'{}.jsonl'.format(data_name))) diff --git a/opencompass/datasets/medbench/evaluation.py b/opencompass/datasets/medbench/evaluation.py index c5a9916a..56379dc9 100644 --- a/opencompass/datasets/medbench/evaluation.py +++ b/opencompass/datasets/medbench/evaluation.py @@ -1,43 +1,43 @@ -# flake8: noqa -from . import dataset_loader, utils -from .math_equivalence import is_equiv - - -def convert_to_set(item): - if isinstance(item, list): - return set(item) - if isinstance(item, str): - return {item} - if item is None: - return {} - raise ValueError("Input can't parse:", item) - - -def evaluate_single_sample(dataset_name, prediction, label): - if dataset_name in dataset_loader.multi_choice_datasets: - p = convert_to_set(prediction) - l = convert_to_set(label) - return p == l - elif dataset_name in dataset_loader.math_output_datasets: - return is_equiv(prediction, label) - else: - return prediction == label - - -# def evaluate(dataset_name, prediction_list, label_list): -# correct = 0 -# if dataset_name in multi_choice_datasets: -# for prediction, label in zip(prediction_list, label_list): -# p = convert_to_set(prediction) -# l = convert_to_set(label) -# if p == l: -# correct += 1 -# elif dataset_name in math_output_datasets: -# for prediction, label in zip(prediction_list, label_list): -# if is_equiv(prediction, label): -# correct += 1 -# else: -# for prediction, label in zip(prediction_list, label_list): -# if prediction == label: -# correct += 1 -# return "{0:.2%}".format(correct / len(label_list)) +# flake8: noqa +from . import dataset_loader, utils +from .math_equivalence import is_equiv + + +def convert_to_set(item): + if isinstance(item, list): + return set(item) + if isinstance(item, str): + return {item} + if item is None: + return {} + raise ValueError("Input can't parse:", item) + + +def evaluate_single_sample(dataset_name, prediction, label): + if dataset_name in dataset_loader.multi_choice_datasets: + p = convert_to_set(prediction) + l = convert_to_set(label) + return p == l + elif dataset_name in dataset_loader.math_output_datasets: + return is_equiv(prediction, label) + else: + return prediction == label + + +# def evaluate(dataset_name, prediction_list, label_list): +# correct = 0 +# if dataset_name in multi_choice_datasets: +# for prediction, label in zip(prediction_list, label_list): +# p = convert_to_set(prediction) +# l = convert_to_set(label) +# if p == l: +# correct += 1 +# elif dataset_name in math_output_datasets: +# for prediction, label in zip(prediction_list, label_list): +# if is_equiv(prediction, label): +# correct += 1 +# else: +# for prediction, label in zip(prediction_list, label_list): +# if prediction == label: +# correct += 1 +# return "{0:.2%}".format(correct / len(label_list)) diff --git a/opencompass/datasets/medbench/math_equivalence.py b/opencompass/datasets/medbench/math_equivalence.py index 788900ea..3f814435 100644 --- a/opencompass/datasets/medbench/math_equivalence.py +++ b/opencompass/datasets/medbench/math_equivalence.py @@ -1,161 +1,161 @@ -# flake8: noqa - - -# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py -def _fix_fracs(string): - substrs = string.split('\\frac') - new_str = substrs[0] - if len(substrs) > 1: - substrs = substrs[1:] - for substr in substrs: - new_str += '\\frac' - if substr[0] == '{': - new_str += substr - else: - try: - assert len(substr) >= 2 - except: - return string - a = substr[0] - b = substr[1] - if b != '{': - if len(substr) > 2: - post_substr = substr[2:] - new_str += '{' + a + '}{' + 
b + '}' + post_substr - else: - new_str += '{' + a + '}{' + b + '}' - else: - if len(substr) > 2: - post_substr = substr[2:] - new_str += '{' + a + '}' + b + post_substr - else: - new_str += '{' + a + '}' + b - string = new_str - return string - - -def _fix_a_slash_b(string): - if len(string.split('/')) != 2: - return string - a = string.split('/')[0] - b = string.split('/')[1] - try: - a = int(a) - b = int(b) - assert string == '{}/{}'.format(a, b) - new_string = '\\frac{' + str(a) + '}{' + str(b) + '}' - return new_string - except: - return string - - -def _remove_right_units(string): - # "\\text{ " only ever occurs (at least in the val set) when describing units - if '\\text{ ' in string: - splits = string.split('\\text{ ') - assert len(splits) == 2 - return splits[0] - else: - return string - - -def _fix_sqrt(string): - if '\\sqrt' not in string: - return string - splits = string.split('\\sqrt') - new_string = splits[0] - for split in splits[1:]: - if split[0] != '{': - a = split[0] - new_substr = '\\sqrt{' + a + '}' + split[1:] - else: - new_substr = '\\sqrt' + split - new_string += new_substr - return new_string - - -def _strip_string(string): - # linebreaks - string = string.replace('\n', '') - # print(string) - - # remove inverse spaces - string = string.replace('\\!', '') - # print(string) - - # replace \\ with \ - string = string.replace('\\\\', '\\') - # print(string) - - # replace tfrac and dfrac with frac - string = string.replace('tfrac', 'frac') - string = string.replace('dfrac', 'frac') - # print(string) - - # remove \left and \right - string = string.replace('\\left', '') - string = string.replace('\\right', '') - # print(string) - - # Remove circ (degrees) - string = string.replace('^{\\circ}', '') - string = string.replace('^\\circ', '') - - # remove dollar signs - string = string.replace('\\$', '') - - # remove units (on the right) - string = _remove_right_units(string) - - # remove percentage - string = string.replace('\\%', '') - string = string.replace('\%', '') - - # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string - string = string.replace(' .', ' 0.') - string = string.replace('{.', '{0.') - # if empty, return empty string - if len(string) == 0: - return string - if string[0] == '.': - string = '0' + string - - # to consider: get rid of e.g. "k = " or "q = " at beginning - if len(string.split('=')) == 2: - if len(string.split('=')[0]) <= 2: - string = string.split('=')[1] - - # fix sqrt3 --> sqrt{3} - string = _fix_sqrt(string) - - # remove spaces - string = string.replace(' ', '') - - # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). 
Also does a/b --> \\frac{a}{b} - string = _fix_fracs(string) - - # manually change 0.5 --> \frac{1}{2} - if string == '0.5': - string = '\\frac{1}{2}' - - # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y - string = _fix_a_slash_b(string) - - return string - - -def is_equiv(str1, str2, verbose=False): - if str1 is None and str2 is None: - print('WARNING: Both None') - return True - if str1 is None or str2 is None: - return False - - try: - ss1 = _strip_string(str1) - ss2 = _strip_string(str2) - if verbose: - print(ss1, ss2) - return ss1 == ss2 - except: - return str1 == str2 +# flake8: noqa + + +# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py +def _fix_fracs(string): + substrs = string.split('\\frac') + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += '\\frac' + if substr[0] == '{': + new_str += substr + else: + try: + assert len(substr) >= 2 + except: + return string + a = substr[0] + b = substr[1] + if b != '{': + if len(substr) > 2: + post_substr = substr[2:] + new_str += '{' + a + '}{' + b + '}' + post_substr + else: + new_str += '{' + a + '}{' + b + '}' + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += '{' + a + '}' + b + post_substr + else: + new_str += '{' + a + '}' + b + string = new_str + return string + + +def _fix_a_slash_b(string): + if len(string.split('/')) != 2: + return string + a = string.split('/')[0] + b = string.split('/')[1] + try: + a = int(a) + b = int(b) + assert string == '{}/{}'.format(a, b) + new_string = '\\frac{' + str(a) + '}{' + str(b) + '}' + return new_string + except: + return string + + +def _remove_right_units(string): + # "\\text{ " only ever occurs (at least in the val set) when describing units + if '\\text{ ' in string: + splits = string.split('\\text{ ') + assert len(splits) == 2 + return splits[0] + else: + return string + + +def _fix_sqrt(string): + if '\\sqrt' not in string: + return string + splits = string.split('\\sqrt') + new_string = splits[0] + for split in splits[1:]: + if split[0] != '{': + a = split[0] + new_substr = '\\sqrt{' + a + '}' + split[1:] + else: + new_substr = '\\sqrt' + split + new_string += new_substr + return new_string + + +def _strip_string(string): + # linebreaks + string = string.replace('\n', '') + # print(string) + + # remove inverse spaces + string = string.replace('\\!', '') + # print(string) + + # replace \\ with \ + string = string.replace('\\\\', '\\') + # print(string) + + # replace tfrac and dfrac with frac + string = string.replace('tfrac', 'frac') + string = string.replace('dfrac', 'frac') + # print(string) + + # remove \left and \right + string = string.replace('\\left', '') + string = string.replace('\\right', '') + # print(string) + + # Remove circ (degrees) + string = string.replace('^{\\circ}', '') + string = string.replace('^\\circ', '') + + # remove dollar signs + string = string.replace('\\$', '') + + # remove units (on the right) + string = _remove_right_units(string) + + # remove percentage + string = string.replace('\\%', '') + string = string.replace('\%', '') + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(' .', ' 0.') + string = string.replace('{.', '{0.') + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == '.': + string = '0' + string + + # to consider: get rid of e.g. 
"k = " or "q = " at beginning + if len(string.split('=')) == 2: + if len(string.split('=')[0]) <= 2: + string = string.split('=')[1] + + # fix sqrt3 --> sqrt{3} + string = _fix_sqrt(string) + + # remove spaces + string = string.replace(' ', '') + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} + string = _fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == '0.5': + string = '\\frac{1}{2}' + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = _fix_a_slash_b(string) + + return string + + +def is_equiv(str1, str2, verbose=False): + if str1 is None and str2 is None: + print('WARNING: Both None') + return True + if str1 is None or str2 is None: + return False + + try: + ss1 = _strip_string(str1) + ss2 = _strip_string(str2) + if verbose: + print(ss1, ss2) + return ss1 == ss2 + except: + return str1 == str2 diff --git a/opencompass/datasets/medbench/medbench.py b/opencompass/datasets/medbench/medbench.py index 54690947..140d4d73 100644 --- a/opencompass/datasets/medbench/medbench.py +++ b/opencompass/datasets/medbench/medbench.py @@ -1,589 +1,589 @@ -import json -import os.path as osp -import sys -from datasets import Dataset -from sklearn.metrics import classification_report -from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from .math_equivalence import is_equiv -from .post_process import parse_math_answer, parse_qa_multiple_answer - -import evaluate -from nltk.translate.bleu_score import sentence_bleu -# # from bert_score import score -import re -from transformers import BasicTokenizer -from rouge_chinese import Rouge -basic_tokenizer = BasicTokenizer(tokenize_chinese_chars=True) - -@LOAD_DATASET.register_module() -class MedBenchDataset(BaseDataset): - - @staticmethod - def load(path: str, name: str, setting_name: str): - path = get_data_path(path, local_mode=True) - from .dataset_loader import load_dataset, load_dataset_as_result_schema - - assert setting_name in 'zero-shot', 'only support zero-shot setting' - dataset_wo_label = load_dataset(name, setting_name, path) - dataset_with_label = load_dataset_as_result_schema(name, path) - dataset = [] - for d1, d2 in zip(dataset_wo_label, dataset_with_label): - dataset.append({ - 'id': d2.index, - 'problem_input': d1['context'], - 'label': d2.label, - }) - dataset = Dataset.from_list(dataset) - return dataset - - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator(BaseEvaluator): - - def score(self, predictions, references): - # predictions: [[]] - # references: [[]] - predictions = [parse_qa_multiple_answer(pred) for pred in predictions] - details = [] - cnt = 0 - for pred, ref in zip(predictions, references): - detail = {'pred': pred, 'answer': ref, 'correct': False} - if is_equiv(pred, ref): - cnt += 1 - detail['correct'] = True - details.append(detail) - score = cnt / len(predictions) * 100 - return {'Accuracy': score, 'details': details} - -def process_generated_results_CMeEE(pred_file): - # 实体每类占一行,每行格式为 "[类型名称]实体:实体名称1,实体名称2,实体名称3\n" - # 多个实体,用 ,符号分割 - structured_output = [] - answer_choices = ['药物', '设备', '医院科室', '微生物类', '身体部位', '医疗操作', '医学检验项目', '症状', '疾病'] - for pred in pred_file: - list_entities = [] - for choice in answer_choices: - for piece in re.split('\n', pred): - if piece.startswith(f"{choice}"): 
- mentions = re.split(r"[,,]", piece.replace(f"{choice}:", "").replace(f"{choice}:", "")) - for ment in mentions: - list_entities.append({'type':choice, 'entity':ment}) - structured_output.append(list_entities) - return structured_output - -def process_generated_results_EMR(pred_file): - structured_output = [] - regex = r"^(主诉|现病史|既往史|个人史|婚育史|家族史)[::]([\s\S]+)$" - for prediction in pred_file: - entities: dict = {} - if "\n\n" in prediction: - blocks = prediction.split("\n\n") - else: - blocks = prediction.splitlines() - for line in blocks: - if match := re.match(regex, line.strip()): - type_ = match[1] - mention = match[2].strip() - entities[type_] = mention - structured_output.append(entities) - return structured_output - -def process_generated_results_CMeIE(pred_file): - structured_output = [] - for line in pred_file: - gen_output = line - - answer_choices = "相关(导致)、鉴别诊断、遗传因素、发病性别倾向、相关(症状)、手术治疗、预防、辅助检查、筛查、阶段、临床表现、风险评估因素、同义词、发病年龄、预后生存率、病史、传播途径、治疗后症状、药物治疗、辅助治疗、化疗、死亡率、放射治疗、病因、组织学检查、内窥镜检查、多发群体、并发症、实验室检查、就诊科室、病理生理、高危因素、发病率、多发地区、病理分型、影像学检查、转移部位、发病部位、相关(转化)、外侵部位、预后状况、发病机制、多发季节" - re_choices = "|".join(re.escape(choice) for choice in answer_choices.split('、')) - regex = ( - rf'关系[::]["“]({re_choices})["”][,,]' - r'头实体[::]["“]([^"”]+)["”][,,]尾实体[::]["“]([^"”]+)["”]' - ) - - list_spos = [] - list_answer_strs = gen_output.split("\n") - for line in list_answer_strs: - for item in re.finditer(regex, line): - print(item) - for match in re.finditer(regex, line): - list_spos.append({"predicate": match[1], "subject": match[2], "object": match[3]}) - - structured_output.append(list_spos) - return structured_output - -def process_generated_results_CDN(pred_file): - structured_output = [] - answer_choices = json.load(open('./opencompass/datasets/medbench/entity_list.jsonl', 'r')) - for line in pred_file: - gen_output = line - - answer_str = gen_output.split("\n")[-1] - answers = answer_str.split(",") - answers = [w.strip() for w in answers if len(w.strip()) > 0] - answers = [w for w in answers if w in answer_choices] - answers = list(set(answers)) - answers = [ - { - "entity": w, - "type": "normalization", - } - for w in answers - ] - - structured_output.append(answers) - return structured_output - -def process_generated_results_CDEE(pred_file): - structured_output = [] - for prediction in pred_file: - events: list[dict] = [] - for line in prediction.splitlines(): - if "主体词" in line: - line = line.rstrip("。") - kvs = line.split(";") - kv_dict = dict(kv.split(":", maxsplit=1) for kv in kvs if ":" in kv) - events.append({ - "主体词": kv_dict.get("主体词", ""), - "发生状态": ( - v - if (v := kv_dict.get("发生状态", "不确定")) in ("不确定", "否定") - else "" - ), - "描述词": ( - v.split(",") if (v := kv_dict.get("描述词", "空")) != "空" else [] - ), - "解剖部位": ( - v.split(",") - if (v := kv_dict.get("解剖部位", "空")) != "空" - else [] - ), - }) - structured_output.append(events) - return structured_output - -def process_generated_results_CTC(pred_file): - structured_output = [] - - for line in pred_file: - gen_output = line - # 答案格式:直接回答分类标签 - answer_str = gen_output.strip() - structured_output.append(answer_str) - return structured_output - -def process_generated_results_doc_parsing(pred_file): - float_field_regex = r"(体温|脉搏|心率|收缩压|舒张压|呼吸)[^\d]*(\d+(?:\.\d+)?)" - - output = [] - for prediction in pred_file: - entities = { - "体温": "未扪及", - "脉搏": "未扪及", - "心率": "未扪及", - "收缩压": "未扪及", - "舒张压": "未扪及", - "呼吸": "未扪及", - "是否上腹部深压痛": None, - "是否腹部反跳痛": None, - "上腹部肿块": None, - } - for sentence in re.split("[,|。|\n]", prediction): - for match in 
re.finditer(float_field_regex, prediction): - entities[match[1]] = match[2] - if "上腹部深压痛" in sentence: - if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): - entities["是否上腹部深压痛"] = "是" - else: - entities["是否上腹部深压痛"] = "否" - elif "腹部反跳痛" in sentence: - if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): - entities["是否腹部反跳痛"] = "是" - else: - entities["是否腹部反跳痛"] = "否" - elif "上腹部肿块" in sentence: - if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): - entities["上腹部肿块"] = "扪及" - else: - entities["上腹部肿块"] = "未扪及" - result = [ - { - "type": "体温(℃)", - "entity": entities["体温"], - }, - { - "type": "脉搏(次/分)", - "entity": entities["脉搏"], - }, - { - "type": "心率(次/分)", - "entity": entities["心率"], - }, - { - "type": "收缩压(mmHg)", - "entity": entities["收缩压"], - }, - { - "type": "舒张压(mmHg)", - "entity": entities["舒张压"], - }, - { - "type": "呼吸(次/分)", - "entity": entities["呼吸"], - }, - ] - if entities["是否上腹部深压痛"]: - result.append({ - "type": "是否上腹部深压痛", - "entity": entities["是否上腹部深压痛"], - }) - if entities["是否腹部反跳痛"]: - result.append({ - "type": "是否腹部反跳痛", - "entity": entities["是否腹部反跳痛"], - }) - if entities["上腹部肿块"]: - result.append({ - "type": "上腹部肿块", - "entity": entities["上腹部肿块"], - }) - - output.append(result) - return output - -def process_generated_results_mrg(pred_file): - structured_output = [] - regex = r"^(主诉|现病史|辅助检查|既往史|诊断|建议)[::]([\s\S]+)$" - for prediction in pred_file: - entities = {} - if "\n\n" in prediction: - blocks = prediction.split("\n\n") - else: - blocks = prediction.splitlines() - for line in blocks: - if match := re.match(regex, line.strip()): - type_ = match[1] - mention = match[2].strip() - entities[type_] = mention - structured_output.append(entities) - return structured_output - -def calc_info_extract_task_scores(list_structured_predict, list_structured_golden): - - assert len(list_structured_golden) == len(list_structured_predict) - - tp = 0 - fp = 0 - fn = 0 - for samp_golden, samp_predict in zip(list_structured_golden, list_structured_predict): - # samp_golden: [[{}]] - answer_golden = samp_golden - answer_predict = samp_predict - # assert isinstance(answer_golden, list) - # assert isinstance(answer_predict, list), "sample format is wrong!" 
- - set_golden = set() - for inst in answer_golden: - assert isinstance(inst, dict) - keys = sorted(list(inst.keys())) - inst = tuple([json.dumps(inst[w], ensure_ascii=False) for w in keys ]) - # inst = list(inst.items()) - # inst.sort() - # inst = tuple(inst) - - set_golden.add(inst) - - set_predict = set() - for inst in answer_predict: - assert isinstance(inst, dict) - keys = sorted(list(inst.keys())) - - inst = tuple([json.dumps(inst[w], ensure_ascii=False) for w in keys]) - - set_predict.add(inst) - - tp += len(set_golden.intersection(set_predict)) - fp += len(set_predict.difference(set_golden)) - fn += len(set_golden.difference(set_predict)) - - if tp: - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f1 = 2 * precision * recall / (precision + recall) - - else: - precision, recall, f1 = 0, 0, 0 - - return precision, recall, f1 - -def calc_cls_task_scores(list_structured_golden, - list_structured_predict, - list_labels=None, - return_macro=False, - ): - # types = list_labels - # scores = {c: {"tp": 0, "fp": 0, "fn": 0, "tn": 0} for c in list_labels + ["ALL"]} - - predictions = [] - ground_truths = [] - - # Count GT relations and Predicted relations - assert len(list_structured_golden) == len(list_structured_predict) - n_sents = len(list_structured_golden) - - # Count TP, FP and FN per type - for pred_samp, gt_samp in zip(list_structured_predict, list_structured_golden): - - pred_label = pred_samp - gt_label = gt_samp - # assert gt_label != "" - if gt_label == "": - get_label = list_labels[0] - if pred_label == "": - pred_label = list_labels[0] - - predictions.append(pred_label) - ground_truths.append(gt_label) - - # metric - cls_report = classification_report( - ground_truths, predictions, - output_dict=True, - zero_division=0, - ) - - if return_macro: - return cls_report["macro avg"]["precision"], \ - cls_report["macro avg"]["recall"], \ - cls_report["macro avg"]["f1-score"] - else: - return cls_report["weighted avg"]["precision"], \ - cls_report["weighted avg"]["recall"], \ - cls_report["weighted avg"]["f1-score"] - -def calc_nlg_task_scores(list_structured_golden, list_structured_predict): - - assert len(list_structured_golden) == len(list_structured_predict) - - scores = [] - predictions = [] - references = [] - details = [] - for samp_golden, samp_predict in zip(list_structured_golden, list_structured_predict): - - answer_golden = samp_golden - answer_predict = samp_predict - - if not (answer_predict and answer_golden): - continue - - # basic tokenizer: 拆分中文字,保留英文单词 - answer_predict = basic_tokenizer.tokenize(answer_predict) - answer_golden = basic_tokenizer.tokenize(answer_golden) - answer_predict = " ".join(answer_predict).strip() - answer_golden = " ".join(answer_golden).strip() - if answer_golden.strip() == "": - answer_golden = "无 。" - if answer_predict.strip() == "": - answer_predict = "无 。" - - predictions.append(answer_predict) - references.append(answer_golden) - - details.append({'pred':answer_predict, 'answer':answer_golden, 'correct':False}) - - rouge = Rouge() - # bleu = evaluate.load('sacrebleu') - scores = rouge.get_scores(predictions, references, avg=True) - # scores_bleu = bleu.compute(predictions=predictions, references=references) - - rouge1 = scores["rouge-1"]["f"] - rouge2 = scores["rouge-2"]["f"] - rougeL = scores["rouge-l"]["f"] - - # bleu = sentence_bleu(references, predictions) - - # bert_score = [] - # for id in range(len(predictions)): - # P, R, F1 = score([predictions[i]], [references[i]], model_type='bert-base-chinese', lang="zh", verbose=True) 
- # bert_score.append(F1) - # bert_score = float(sum(bert_score)) / float(len(bert_score)) - # return rougeL, bleu, bert_score - return {'RougeL': rougeL, 'details':details} - -def calc_scores_f1(dict_gt, dict_pred): - details = [] - for gt, pred in zip(dict_gt, dict_pred): - details.append({'pred':pred, 'answer':gt, 'correct':None}) - - precision, recall, f1 = calc_info_extract_task_scores(dict_gt, dict_pred) - return {'F1':f1, 'details':details} - -def calc_scores_ctc(dict_gt, dict_pred): - details = [] - for gt, pred in zip(dict_gt, dict_pred): - details.append({'pred':pred, 'answer':gt, 'correct':None}) - - gts = dict_gt - preds = dict_pred - - precision, recall, f1 = calc_cls_task_scores( - gts, - preds, - list_labels=['非上述类型', '疾病', '症状(患者感受)', - '体征(医生检测)', '怀孕相关', '肿瘤进展', - '疾病分期', '过敏耐受', '器官组织状态', - '预期寿命', '口腔相关', '药物', - '治疗或手术', '设备', '护理', - '诊断', '实验室检查', '风险评估', - '受体状态', '年龄', '特殊病人特征', - '读写能力', '性别', '教育情况', - '居住情况', '种族', '知情同意', - '参与其它试验', '研究者决定', '能力', - '伦理审查', '依存性', '成瘾行为', - '睡眠', '锻炼', '饮食', '酒精使用', - '性取向', '吸烟状况', '献血', - '病例来源', '残疾群体', '健康群体', - '数据可及性', "含有多个类别"], - return_macro=True, - ) - return {'Macro-F1':f1, 'details':details} - -def calc_scores_nlg(dict_gt, dict_pred): - - # scores = {} - scores = {'score':0, 'details':[]} - success_flag = 1 - - gts = dict_gt - preds = dict_pred - # if not len(gts) == len(preds): - # success_flag = 0 - # try: - return calc_nlg_task_scores(gts, preds) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_CMeEE(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_CMeEE(predictions) - return calc_scores_f1(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_DBMHG(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_EMR(predictions) - return calc_scores_f1(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_IMCS_V2_MRG(BaseEvaluator): - - def score(self, predictions, references): - # predictions = process_generated_results_mrg(predictions) - references_revise = [] - for item in references: - temp_ref = '' - for sub_item in item: - temp_ref += sub_item['type'] + ':' + sub_item['entity'] + '\n' - references_revise.append(temp_ref) - return calc_nlg_task_scores(references_revise, predictions) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_CMeIE(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_CMeIE(predictions) - return calc_scores_f1(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_CHIP_CDEE(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_CDEE(predictions) - return calc_scores_f1(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_CHIP_CDN(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_CDN(predictions) - return calc_scores_f1(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_CHIP_CTC(BaseEvaluator): - - def score(self, predictions, references): - predictions = process_generated_results_CTC(predictions) - return calc_scores_ctc(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_Doc_parsing(BaseEvaluator): - - def score(self, predictions, references): - # predictions = process_generated_results_doc_parsing(predictions) - 
references_revise = [] - for item in references: - temp_ref = '' - for sub_item in item: - temp_ref += sub_item['type'] + ':' + sub_item['entity'] + '\n' - references_revise.append(temp_ref) - return calc_nlg_task_scores(references_revise, predictions) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_NLG(BaseEvaluator): - - def score(self, predictions, references): - # predictions = process_generated_results_med(predictions) - return calc_scores_nlg(predictions, references) - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_Cloze(BaseEvaluator): - - def score(self, predictions, references): - # predictions: [[]] - # references: [[]] - # predictions = [parse_qa_multiple_answer(pred) for pred in predictions] - details = [] - cnt = 0 - - for pred, ref in zip(predictions, references): - detail = {'pred':pred, 'answer':ref, 'correct':False} - - if sum([item in pred for item in ref]) == len(ref): - cnt += 1 - detail['correct'] = True - details.append(detail) - score = cnt / len(predictions) * 100 - return {'Accuracy': score, 'details': details} - -@ICL_EVALUATORS.register_module() -class MedBenchEvaluator_TF(BaseEvaluator): - - def score(self, predictions, references): - # predictions: [[]] - # references: [[]] - # predictions = [parse_qa_multiple_answer(pred) for pred in predictions] - details = [] - cnt = 0 - - for pred, ref in zip(predictions, references): - - if '不' in pred or '否' in pred: - cur_pred = '不可以' - else: - cur_pred = '可以' - - detail = {'pred':cur_pred, 'answer':ref, 'correct':False} - - if cur_pred == ref: - cnt += 1 - detail['correct'] = True - - details.append(detail) - - score = cnt / len(predictions) * 100 +import json +import os.path as osp +import sys +from datasets import Dataset +from sklearn.metrics import classification_report +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from .math_equivalence import is_equiv +from .post_process import parse_math_answer, parse_qa_multiple_answer + +import evaluate +from nltk.translate.bleu_score import sentence_bleu +# # from bert_score import score +import re +from transformers import BasicTokenizer +from rouge_chinese import Rouge +basic_tokenizer = BasicTokenizer(tokenize_chinese_chars=True) + +@LOAD_DATASET.register_module() +class MedBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str, setting_name: str): + path = get_data_path(path, local_mode=True) + from .dataset_loader import load_dataset, load_dataset_as_result_schema + + assert setting_name in 'zero-shot', 'only support zero-shot setting' + dataset_wo_label = load_dataset(name, setting_name, path) + dataset_with_label = load_dataset_as_result_schema(name, path) + dataset = [] + for d1, d2 in zip(dataset_wo_label, dataset_with_label): + dataset.append({ + 'id': d2.index, + 'problem_input': d1['context'], + 'label': d2.label, + }) + dataset = Dataset.from_list(dataset) + return dataset + + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator(BaseEvaluator): + + def score(self, predictions, references): + # predictions: [[]] + # references: [[]] + predictions = [parse_qa_multiple_answer(pred) for pred in predictions] + details = [] + cnt = 0 + for pred, ref in zip(predictions, references): + detail = {'pred': pred, 'answer': ref, 'correct': False} + if is_equiv(pred, ref): + cnt += 1 + detail['correct'] = True + details.append(detail) + score = cnt / len(predictions) 
* 100 + return {'Accuracy': score, 'details': details} + +def process_generated_results_CMeEE(pred_file): + # 实体每类占一行,每行格式为 "[类型名称]实体:实体名称1,实体名称2,实体名称3\n" + # 多个实体,用 ,符号分割 + structured_output = [] + answer_choices = ['药物', '设备', '医院科室', '微生物类', '身体部位', '医疗操作', '医学检验项目', '症状', '疾病'] + for pred in pred_file: + list_entities = [] + for choice in answer_choices: + for piece in re.split('\n', pred): + if piece.startswith(f"{choice}"): + mentions = re.split(r"[,,]", piece.replace(f"{choice}:", "").replace(f"{choice}:", "")) + for ment in mentions: + list_entities.append({'type':choice, 'entity':ment}) + structured_output.append(list_entities) + return structured_output + +def process_generated_results_EMR(pred_file): + structured_output = [] + regex = r"^(主诉|现病史|既往史|个人史|婚育史|家族史)[::]([\s\S]+)$" + for prediction in pred_file: + entities: dict = {} + if "\n\n" in prediction: + blocks = prediction.split("\n\n") + else: + blocks = prediction.splitlines() + for line in blocks: + if match := re.match(regex, line.strip()): + type_ = match[1] + mention = match[2].strip() + entities[type_] = mention + structured_output.append(entities) + return structured_output + +def process_generated_results_CMeIE(pred_file): + structured_output = [] + for line in pred_file: + gen_output = line + + answer_choices = "相关(导致)、鉴别诊断、遗传因素、发病性别倾向、相关(症状)、手术治疗、预防、辅助检查、筛查、阶段、临床表现、风险评估因素、同义词、发病年龄、预后生存率、病史、传播途径、治疗后症状、药物治疗、辅助治疗、化疗、死亡率、放射治疗、病因、组织学检查、内窥镜检查、多发群体、并发症、实验室检查、就诊科室、病理生理、高危因素、发病率、多发地区、病理分型、影像学检查、转移部位、发病部位、相关(转化)、外侵部位、预后状况、发病机制、多发季节" + re_choices = "|".join(re.escape(choice) for choice in answer_choices.split('、')) + regex = ( + rf'关系[::]["“]({re_choices})["”][,,]' + r'头实体[::]["“]([^"”]+)["”][,,]尾实体[::]["“]([^"”]+)["”]' + ) + + list_spos = [] + list_answer_strs = gen_output.split("\n") + for line in list_answer_strs: + for item in re.finditer(regex, line): + print(item) + for match in re.finditer(regex, line): + list_spos.append({"predicate": match[1], "subject": match[2], "object": match[3]}) + + structured_output.append(list_spos) + return structured_output + +def process_generated_results_CDN(pred_file): + structured_output = [] + answer_choices = json.load(open('./opencompass/datasets/medbench/entity_list.jsonl', 'r')) + for line in pred_file: + gen_output = line + + answer_str = gen_output.split("\n")[-1] + answers = answer_str.split(",") + answers = [w.strip() for w in answers if len(w.strip()) > 0] + answers = [w for w in answers if w in answer_choices] + answers = list(set(answers)) + answers = [ + { + "entity": w, + "type": "normalization", + } + for w in answers + ] + + structured_output.append(answers) + return structured_output + +def process_generated_results_CDEE(pred_file): + structured_output = [] + for prediction in pred_file: + events: list[dict] = [] + for line in prediction.splitlines(): + if "主体词" in line: + line = line.rstrip("。") + kvs = line.split(";") + kv_dict = dict(kv.split(":", maxsplit=1) for kv in kvs if ":" in kv) + events.append({ + "主体词": kv_dict.get("主体词", ""), + "发生状态": ( + v + if (v := kv_dict.get("发生状态", "不确定")) in ("不确定", "否定") + else "" + ), + "描述词": ( + v.split(",") if (v := kv_dict.get("描述词", "空")) != "空" else [] + ), + "解剖部位": ( + v.split(",") + if (v := kv_dict.get("解剖部位", "空")) != "空" + else [] + ), + }) + structured_output.append(events) + return structured_output + +def process_generated_results_CTC(pred_file): + structured_output = [] + + for line in pred_file: + gen_output = line + # 答案格式:直接回答分类标签 + answer_str = gen_output.strip() + structured_output.append(answer_str) + 
return structured_output + +def process_generated_results_doc_parsing(pred_file): + float_field_regex = r"(体温|脉搏|心率|收缩压|舒张压|呼吸)[^\d]*(\d+(?:\.\d+)?)" + + output = [] + for prediction in pred_file: + entities = { + "体温": "未扪及", + "脉搏": "未扪及", + "心率": "未扪及", + "收缩压": "未扪及", + "舒张压": "未扪及", + "呼吸": "未扪及", + "是否上腹部深压痛": None, + "是否腹部反跳痛": None, + "上腹部肿块": None, + } + for sentence in re.split("[,|。|\n]", prediction): + for match in re.finditer(float_field_regex, prediction): + entities[match[1]] = match[2] + if "上腹部深压痛" in sentence: + if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): + entities["是否上腹部深压痛"] = "是" + else: + entities["是否上腹部深压痛"] = "否" + elif "腹部反跳痛" in sentence: + if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): + entities["是否腹部反跳痛"] = "是" + else: + entities["是否腹部反跳痛"] = "否" + elif "上腹部肿块" in sentence: + if re.search("是(?!否)|(?:^|[^不])存在|有", sentence): + entities["上腹部肿块"] = "扪及" + else: + entities["上腹部肿块"] = "未扪及" + result = [ + { + "type": "体温(℃)", + "entity": entities["体温"], + }, + { + "type": "脉搏(次/分)", + "entity": entities["脉搏"], + }, + { + "type": "心率(次/分)", + "entity": entities["心率"], + }, + { + "type": "收缩压(mmHg)", + "entity": entities["收缩压"], + }, + { + "type": "舒张压(mmHg)", + "entity": entities["舒张压"], + }, + { + "type": "呼吸(次/分)", + "entity": entities["呼吸"], + }, + ] + if entities["是否上腹部深压痛"]: + result.append({ + "type": "是否上腹部深压痛", + "entity": entities["是否上腹部深压痛"], + }) + if entities["是否腹部反跳痛"]: + result.append({ + "type": "是否腹部反跳痛", + "entity": entities["是否腹部反跳痛"], + }) + if entities["上腹部肿块"]: + result.append({ + "type": "上腹部肿块", + "entity": entities["上腹部肿块"], + }) + + output.append(result) + return output + +def process_generated_results_mrg(pred_file): + structured_output = [] + regex = r"^(主诉|现病史|辅助检查|既往史|诊断|建议)[::]([\s\S]+)$" + for prediction in pred_file: + entities = {} + if "\n\n" in prediction: + blocks = prediction.split("\n\n") + else: + blocks = prediction.splitlines() + for line in blocks: + if match := re.match(regex, line.strip()): + type_ = match[1] + mention = match[2].strip() + entities[type_] = mention + structured_output.append(entities) + return structured_output + +def calc_info_extract_task_scores(list_structured_predict, list_structured_golden): + + assert len(list_structured_golden) == len(list_structured_predict) + + tp = 0 + fp = 0 + fn = 0 + for samp_golden, samp_predict in zip(list_structured_golden, list_structured_predict): + # samp_golden: [[{}]] + answer_golden = samp_golden + answer_predict = samp_predict + # assert isinstance(answer_golden, list) + # assert isinstance(answer_predict, list), "sample format is wrong!" 
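+        # Descriptive comment (added): each gold/predicted entity dict below is serialized into a
+        # canonical tuple (sorted keys, JSON-encoded values) so the two sides can be compared as
+        # sets, accumulating corpus-level tp/fp/fn for micro-averaged precision/recall/F1.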
+ + set_golden = set() + for inst in answer_golden: + assert isinstance(inst, dict) + keys = sorted(list(inst.keys())) + inst = tuple([json.dumps(inst[w], ensure_ascii=False) for w in keys ]) + # inst = list(inst.items()) + # inst.sort() + # inst = tuple(inst) + + set_golden.add(inst) + + set_predict = set() + for inst in answer_predict: + assert isinstance(inst, dict) + keys = sorted(list(inst.keys())) + + inst = tuple([json.dumps(inst[w], ensure_ascii=False) for w in keys]) + + set_predict.add(inst) + + tp += len(set_golden.intersection(set_predict)) + fp += len(set_predict.difference(set_golden)) + fn += len(set_golden.difference(set_predict)) + + if tp: + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f1 = 2 * precision * recall / (precision + recall) + + else: + precision, recall, f1 = 0, 0, 0 + + return precision, recall, f1 + +def calc_cls_task_scores(list_structured_golden, + list_structured_predict, + list_labels=None, + return_macro=False, + ): + # types = list_labels + # scores = {c: {"tp": 0, "fp": 0, "fn": 0, "tn": 0} for c in list_labels + ["ALL"]} + + predictions = [] + ground_truths = [] + + # Count GT relations and Predicted relations + assert len(list_structured_golden) == len(list_structured_predict) + n_sents = len(list_structured_golden) + + # Count TP, FP and FN per type + for pred_samp, gt_samp in zip(list_structured_predict, list_structured_golden): + + pred_label = pred_samp + gt_label = gt_samp + # assert gt_label != "" + if gt_label == "": + get_label = list_labels[0] + if pred_label == "": + pred_label = list_labels[0] + + predictions.append(pred_label) + ground_truths.append(gt_label) + + # metric + cls_report = classification_report( + ground_truths, predictions, + output_dict=True, + zero_division=0, + ) + + if return_macro: + return cls_report["macro avg"]["precision"], \ + cls_report["macro avg"]["recall"], \ + cls_report["macro avg"]["f1-score"] + else: + return cls_report["weighted avg"]["precision"], \ + cls_report["weighted avg"]["recall"], \ + cls_report["weighted avg"]["f1-score"] + +def calc_nlg_task_scores(list_structured_golden, list_structured_predict): + + assert len(list_structured_golden) == len(list_structured_predict) + + scores = [] + predictions = [] + references = [] + details = [] + for samp_golden, samp_predict in zip(list_structured_golden, list_structured_predict): + + answer_golden = samp_golden + answer_predict = samp_predict + + if not (answer_predict and answer_golden): + continue + + # basic tokenizer: 拆分中文字,保留英文单词 + answer_predict = basic_tokenizer.tokenize(answer_predict) + answer_golden = basic_tokenizer.tokenize(answer_golden) + answer_predict = " ".join(answer_predict).strip() + answer_golden = " ".join(answer_golden).strip() + if answer_golden.strip() == "": + answer_golden = "无 。" + if answer_predict.strip() == "": + answer_predict = "无 。" + + predictions.append(answer_predict) + references.append(answer_golden) + + details.append({'pred':answer_predict, 'answer':answer_golden, 'correct':False}) + + rouge = Rouge() + # bleu = evaluate.load('sacrebleu') + scores = rouge.get_scores(predictions, references, avg=True) + # scores_bleu = bleu.compute(predictions=predictions, references=references) + + rouge1 = scores["rouge-1"]["f"] + rouge2 = scores["rouge-2"]["f"] + rougeL = scores["rouge-l"]["f"] + + # bleu = sentence_bleu(references, predictions) + + # bert_score = [] + # for id in range(len(predictions)): + # P, R, F1 = score([predictions[i]], [references[i]], model_type='bert-base-chinese', lang="zh", verbose=True) 
+ # bert_score.append(F1) + # bert_score = float(sum(bert_score)) / float(len(bert_score)) + # return rougeL, bleu, bert_score + return {'RougeL': rougeL, 'details':details} + +def calc_scores_f1(dict_gt, dict_pred): + details = [] + for gt, pred in zip(dict_gt, dict_pred): + details.append({'pred':pred, 'answer':gt, 'correct':None}) + + precision, recall, f1 = calc_info_extract_task_scores(dict_gt, dict_pred) + return {'F1':f1, 'details':details} + +def calc_scores_ctc(dict_gt, dict_pred): + details = [] + for gt, pred in zip(dict_gt, dict_pred): + details.append({'pred':pred, 'answer':gt, 'correct':None}) + + gts = dict_gt + preds = dict_pred + + precision, recall, f1 = calc_cls_task_scores( + gts, + preds, + list_labels=['非上述类型', '疾病', '症状(患者感受)', + '体征(医生检测)', '怀孕相关', '肿瘤进展', + '疾病分期', '过敏耐受', '器官组织状态', + '预期寿命', '口腔相关', '药物', + '治疗或手术', '设备', '护理', + '诊断', '实验室检查', '风险评估', + '受体状态', '年龄', '特殊病人特征', + '读写能力', '性别', '教育情况', + '居住情况', '种族', '知情同意', + '参与其它试验', '研究者决定', '能力', + '伦理审查', '依存性', '成瘾行为', + '睡眠', '锻炼', '饮食', '酒精使用', + '性取向', '吸烟状况', '献血', + '病例来源', '残疾群体', '健康群体', + '数据可及性', "含有多个类别"], + return_macro=True, + ) + return {'Macro-F1':f1, 'details':details} + +def calc_scores_nlg(dict_gt, dict_pred): + + # scores = {} + scores = {'score':0, 'details':[]} + success_flag = 1 + + gts = dict_gt + preds = dict_pred + # if not len(gts) == len(preds): + # success_flag = 0 + # try: + return calc_nlg_task_scores(gts, preds) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_CMeEE(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_CMeEE(predictions) + return calc_scores_f1(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_DBMHG(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_EMR(predictions) + return calc_scores_f1(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_IMCS_V2_MRG(BaseEvaluator): + + def score(self, predictions, references): + # predictions = process_generated_results_mrg(predictions) + references_revise = [] + for item in references: + temp_ref = '' + for sub_item in item: + temp_ref += sub_item['type'] + ':' + sub_item['entity'] + '\n' + references_revise.append(temp_ref) + return calc_nlg_task_scores(references_revise, predictions) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_CMeIE(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_CMeIE(predictions) + return calc_scores_f1(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_CHIP_CDEE(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_CDEE(predictions) + return calc_scores_f1(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_CHIP_CDN(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_CDN(predictions) + return calc_scores_f1(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_CHIP_CTC(BaseEvaluator): + + def score(self, predictions, references): + predictions = process_generated_results_CTC(predictions) + return calc_scores_ctc(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_Doc_parsing(BaseEvaluator): + + def score(self, predictions, references): + # predictions = process_generated_results_doc_parsing(predictions) + 
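+        # Descriptive comment (added): each reference is a list of {'type', 'entity'} dicts; it is
+        # flattened into "type:entity" lines so the structured references can be scored against the
+        # raw prediction text with ROUGE-L via calc_nlg_task_scores.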
references_revise = [] + for item in references: + temp_ref = '' + for sub_item in item: + temp_ref += sub_item['type'] + ':' + sub_item['entity'] + '\n' + references_revise.append(temp_ref) + return calc_nlg_task_scores(references_revise, predictions) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_NLG(BaseEvaluator): + + def score(self, predictions, references): + # predictions = process_generated_results_med(predictions) + return calc_scores_nlg(predictions, references) + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_Cloze(BaseEvaluator): + + def score(self, predictions, references): + # predictions: [[]] + # references: [[]] + # predictions = [parse_qa_multiple_answer(pred) for pred in predictions] + details = [] + cnt = 0 + + for pred, ref in zip(predictions, references): + detail = {'pred':pred, 'answer':ref, 'correct':False} + + if sum([item in pred for item in ref]) == len(ref): + cnt += 1 + detail['correct'] = True + details.append(detail) + score = cnt / len(predictions) * 100 + return {'Accuracy': score, 'details': details} + +@ICL_EVALUATORS.register_module() +class MedBenchEvaluator_TF(BaseEvaluator): + + def score(self, predictions, references): + # predictions: [[]] + # references: [[]] + # predictions = [parse_qa_multiple_answer(pred) for pred in predictions] + details = [] + cnt = 0 + + for pred, ref in zip(predictions, references): + + if '不' in pred or '否' in pred: + cur_pred = '不可以' + else: + cur_pred = '可以' + + detail = {'pred':cur_pred, 'answer':ref, 'correct':False} + + if cur_pred == ref: + cnt += 1 + detail['correct'] = True + + details.append(detail) + + score = cnt / len(predictions) * 100 return {'Accuracy': score, 'details': details} \ No newline at end of file diff --git a/opencompass/datasets/medbench/post_process.py b/opencompass/datasets/medbench/post_process.py index 36e4ca81..f78ebc3d 100644 --- a/opencompass/datasets/medbench/post_process.py +++ b/opencompass/datasets/medbench/post_process.py @@ -1,200 +1,200 @@ -# flake8: noqa -import json -import re - -from . 
import dataset_loader - - -def extract_last_line(string): - lines = string.split('\n') - for item in lines[::-1]: - if item.strip() != '': - string = item - break - return string - - -def remove_few_shot_prefix(string: str): - prefix_list = ['The answer is therefore', '答案是'] - for prefix in prefix_list: - if string.startswith(prefix): - string = string[len(prefix):].strip() - elif prefix in string: - index = string.rfind(prefix) - if index >= 0: - string = string[index + len(prefix):].strip() - return string - - -def try_parse_few_shot_qa_single_answer(string, setting_name, language='en'): - if setting_name == 'few-shot-CoT': - string = extract_last_line(string) - if language == 'en': - pattern = 'answer is .*?([A-G])' - match = re.search(pattern, string) - elif language == 'zh': - pattern = '答案是.*?([A-G])' - match = re.search(pattern, string) - else: - raise ValueError('Unknown language {0}'.format(language)) - if match: - return match.group(1) - else: - return None - - -def try_parse_few_shot_pattern(string: str, dataset_name, setting_name): - if setting_name == 'few-shot-CoT': - string = extract_last_line(string) - if dataset_name in dataset_loader.chinese_cloze_datasets: - return string.startswith('答案是') - elif dataset_name in dataset_loader.english_cloze_datasets: - return string.startswith('The answer is therefore') - elif dataset_name in dataset_loader.chinese_qa_datasets: - pattern = '答案是.*?([A-G])' - match = re.search(pattern, string) - return match is not None - elif dataset_name in dataset_loader.english_qa_datasets: - pattern = 'answer is .*?([A-G])' - match = re.search(pattern, string) - return match is not None - return False - - -def parse_few_shot_qa_single_answer(string, setting_name, language='en'): - answer = try_parse_few_shot_qa_single_answer(string, setting_name, - language) - if answer is None: - return find_first_capital_letter(string) - else: - return answer - - -def find_first_capital_letter(answer): - letter_set = {'A', 'B', 'C', 'D', 'E', 'F'} - for c in answer: - if c in letter_set: - return c - # print("Can't find capital letter in:", answer) - return '' - - -def extract_answer_in_bracket(answer, prefix='【', suffix='】'): - if prefix not in answer and suffix not in answer: - # print("doesn't found special tokens in:", answer) - return '' - s = answer.index(prefix) + len(prefix) - t = answer.index(suffix) - ret = answer[s:t] - return ret - - -def parse_math_answer(setting_name, raw_string): - if setting_name == 'few-shot-CoT': - raw_string = extract_last_line(raw_string) - if setting_name == 'few-shot-CoT' or setting_name == 'few-shot': - raw_string = remove_few_shot_prefix(raw_string) - return raw_string - - def remove_boxed(s): - left = '\\boxed{' - try: - assert s[:len(left)] == left - assert s[-1] == '}' - answer = s[len(left):-1] - if '=' in answer: - answer = answer.split('=')[-1].lstrip(' ') - return answer - except: - return None - - def last_boxed_only_string(string): - idx = string.rfind('\\boxed') - if idx < 0: - idx = string.rfind('\\fbox') - if idx < 0: - return None - i = idx - right_brace_idx = None - num_left_braces_open = 0 - while i < len(string): - if string[i] == '{': - num_left_braces_open += 1 - if string[i] == '}': - num_left_braces_open -= 1 - if num_left_braces_open == 0: - right_brace_idx = i - break - i += 1 - - if right_brace_idx == None: - retval = None - else: - retval = string[idx:right_brace_idx + 1] - - return retval - - def get_answer_with_dollar_sign(s): - first_pattern = '\$(.*)\$' - last_match = None - matches = 
re.findall(first_pattern, s) - if matches: - last_match = matches[-1] - if '=' in last_match: - last_match = last_match.split('=')[-1].lstrip(' ') - return last_match - - def get_answer_without_dollar_sign(s): - last_match = None - if '=' in s: - last_match = s.split('=')[-1].lstrip(' ').rstrip('.') - if '\n' in last_match: - last_match = last_match.split('\n')[0] - else: - pattern = '(?:\\$)?\d+(?:\.\d+)?(?![\w\d])' - matches = re.findall(pattern, s) - if matches: - last_match = matches[-1] - return last_match - - raw_string = remove_few_shot_prefix(raw_string) - if '\\boxed' in raw_string: - answer = remove_boxed(last_boxed_only_string(raw_string)) - else: - answer = get_answer_with_dollar_sign(raw_string) - if not answer: - answer = get_answer_without_dollar_sign(raw_string) - return answer - - -def parse_qa_multiple_answer(string): - # if setting_name == 'few-shot-CoT': - # string = extract_last_line(string) - for x in ['CC', 'CA', 'AC', 'POMES', 'AI', 'MIBG', 'CF', 'CTE', 'AD', 'CB', 'BG', 'BD', 'BE', 'BH', 'CTB', 'BI', 'CE', 'Pugh', 'Child', 'CTI', 'CTA', 'TACE', 'PPD', 'Castleman', 'BA', 'CH', 'AB', 'CTC', 'CT', 'CTH', 'CD', 'AH', 'AE', 'AA', 'AF', 'BC', 'CG', 'BB', 'CI', 'BF', 'CTF', 'CTG', 'AG', 'CTD', '分级C', '分级A', 'I131', '分级B', '分级D', '131I‐MIBG', 'NYHA', 'IPF', 'DIP', 'Lambert-Eaton', 'Graves', 'IIA期', 'CKD', 'FDA', 'A级', 'B级', 'C级', 'D级', '维生素D']: - string = string.replace(x, '') - pattern = '\(*([A-Z])\)*' - match = re.findall(pattern, string) - if match: - return match - return [] - - -def post_process(dataset_name, setting_name, prediction): - if dataset_name in dataset_loader.english_cloze_datasets or dataset_name in dataset_loader.chinese_cloze_datasets: - return parse_math_answer(setting_name, prediction) - - if dataset_name in ['jec-qa-kd', 'jec-qa-ca', 'gaokao-physics']: - return parse_qa_multiple_answer(prediction, setting_name) - - # all other datasets are QA problems with single answer - if 'zero-shot' in setting_name: - answer = find_first_capital_letter(prediction) - return answer - - # all other datasets are QA problems with single answer and setting_name are few-shot - language = 'en' if dataset_name in dataset_loader.english_qa_datasets else 'zh' - if dataset_name in dataset_loader.english_qa_datasets or dataset_name in dataset_loader.chinese_qa_datasets: - return parse_few_shot_qa_single_answer(prediction, setting_name, - language) - else: - raise ValueError(f'Unsupported dataset name {dataset_name}') +# flake8: noqa +import json +import re + +from . 
import dataset_loader + + +def extract_last_line(string): + lines = string.split('\n') + for item in lines[::-1]: + if item.strip() != '': + string = item + break + return string + + +def remove_few_shot_prefix(string: str): + prefix_list = ['The answer is therefore', '答案是'] + for prefix in prefix_list: + if string.startswith(prefix): + string = string[len(prefix):].strip() + elif prefix in string: + index = string.rfind(prefix) + if index >= 0: + string = string[index + len(prefix):].strip() + return string + + +def try_parse_few_shot_qa_single_answer(string, setting_name, language='en'): + if setting_name == 'few-shot-CoT': + string = extract_last_line(string) + if language == 'en': + pattern = 'answer is .*?([A-G])' + match = re.search(pattern, string) + elif language == 'zh': + pattern = '答案是.*?([A-G])' + match = re.search(pattern, string) + else: + raise ValueError('Unknown language {0}'.format(language)) + if match: + return match.group(1) + else: + return None + + +def try_parse_few_shot_pattern(string: str, dataset_name, setting_name): + if setting_name == 'few-shot-CoT': + string = extract_last_line(string) + if dataset_name in dataset_loader.chinese_cloze_datasets: + return string.startswith('答案是') + elif dataset_name in dataset_loader.english_cloze_datasets: + return string.startswith('The answer is therefore') + elif dataset_name in dataset_loader.chinese_qa_datasets: + pattern = '答案是.*?([A-G])' + match = re.search(pattern, string) + return match is not None + elif dataset_name in dataset_loader.english_qa_datasets: + pattern = 'answer is .*?([A-G])' + match = re.search(pattern, string) + return match is not None + return False + + +def parse_few_shot_qa_single_answer(string, setting_name, language='en'): + answer = try_parse_few_shot_qa_single_answer(string, setting_name, + language) + if answer is None: + return find_first_capital_letter(string) + else: + return answer + + +def find_first_capital_letter(answer): + letter_set = {'A', 'B', 'C', 'D', 'E', 'F'} + for c in answer: + if c in letter_set: + return c + # print("Can't find capital letter in:", answer) + return '' + + +def extract_answer_in_bracket(answer, prefix='【', suffix='】'): + if prefix not in answer and suffix not in answer: + # print("doesn't found special tokens in:", answer) + return '' + s = answer.index(prefix) + len(prefix) + t = answer.index(suffix) + ret = answer[s:t] + return ret + + +def parse_math_answer(setting_name, raw_string): + if setting_name == 'few-shot-CoT': + raw_string = extract_last_line(raw_string) + if setting_name == 'few-shot-CoT' or setting_name == 'few-shot': + raw_string = remove_few_shot_prefix(raw_string) + return raw_string + + def remove_boxed(s): + left = '\\boxed{' + try: + assert s[:len(left)] == left + assert s[-1] == '}' + answer = s[len(left):-1] + if '=' in answer: + answer = answer.split('=')[-1].lstrip(' ') + return answer + except: + return None + + def last_boxed_only_string(string): + idx = string.rfind('\\boxed') + if idx < 0: + idx = string.rfind('\\fbox') + if idx < 0: + return None + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == '{': + num_left_braces_open += 1 + if string[i] == '}': + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx == None: + retval = None + else: + retval = string[idx:right_brace_idx + 1] + + return retval + + def get_answer_with_dollar_sign(s): + first_pattern = '\$(.*)\$' + last_match = None + matches = 
re.findall(first_pattern, s) + if matches: + last_match = matches[-1] + if '=' in last_match: + last_match = last_match.split('=')[-1].lstrip(' ') + return last_match + + def get_answer_without_dollar_sign(s): + last_match = None + if '=' in s: + last_match = s.split('=')[-1].lstrip(' ').rstrip('.') + if '\n' in last_match: + last_match = last_match.split('\n')[0] + else: + pattern = '(?:\\$)?\d+(?:\.\d+)?(?![\w\d])' + matches = re.findall(pattern, s) + if matches: + last_match = matches[-1] + return last_match + + raw_string = remove_few_shot_prefix(raw_string) + if '\\boxed' in raw_string: + answer = remove_boxed(last_boxed_only_string(raw_string)) + else: + answer = get_answer_with_dollar_sign(raw_string) + if not answer: + answer = get_answer_without_dollar_sign(raw_string) + return answer + + +def parse_qa_multiple_answer(string): + # if setting_name == 'few-shot-CoT': + # string = extract_last_line(string) + for x in ['CC', 'CA', 'AC', 'POMES', 'AI', 'MIBG', 'CF', 'CTE', 'AD', 'CB', 'BG', 'BD', 'BE', 'BH', 'CTB', 'BI', 'CE', 'Pugh', 'Child', 'CTI', 'CTA', 'TACE', 'PPD', 'Castleman', 'BA', 'CH', 'AB', 'CTC', 'CT', 'CTH', 'CD', 'AH', 'AE', 'AA', 'AF', 'BC', 'CG', 'BB', 'CI', 'BF', 'CTF', 'CTG', 'AG', 'CTD', '分级C', '分级A', 'I131', '分级B', '分级D', '131I‐MIBG', 'NYHA', 'IPF', 'DIP', 'Lambert-Eaton', 'Graves', 'IIA期', 'CKD', 'FDA', 'A级', 'B级', 'C级', 'D级', '维生素D']: + string = string.replace(x, '') + pattern = '\(*([A-Z])\)*' + match = re.findall(pattern, string) + if match: + return match + return [] + + +def post_process(dataset_name, setting_name, prediction): + if dataset_name in dataset_loader.english_cloze_datasets or dataset_name in dataset_loader.chinese_cloze_datasets: + return parse_math_answer(setting_name, prediction) + + if dataset_name in ['jec-qa-kd', 'jec-qa-ca', 'gaokao-physics']: + return parse_qa_multiple_answer(prediction, setting_name) + + # all other datasets are QA problems with single answer + if 'zero-shot' in setting_name: + answer = find_first_capital_letter(prediction) + return answer + + # all other datasets are QA problems with single answer and setting_name are few-shot + language = 'en' if dataset_name in dataset_loader.english_qa_datasets else 'zh' + if dataset_name in dataset_loader.english_qa_datasets or dataset_name in dataset_loader.chinese_qa_datasets: + return parse_few_shot_qa_single_answer(prediction, setting_name, + language) + else: + raise ValueError(f'Unsupported dataset name {dataset_name}') diff --git a/opencompass/datasets/medbench/utils.py b/opencompass/datasets/medbench/utils.py index fbb31105..a8c4632c 100644 --- a/opencompass/datasets/medbench/utils.py +++ b/opencompass/datasets/medbench/utils.py @@ -1,43 +1,43 @@ -# flake8: noqa -import json - - -def read_jsonl(path): - with open(path, encoding='utf8') as fh: - results = [] - for line in fh: - if line is None: - continue - try: - results.append(json.loads(line) if line != 'null' else line) - except Exception as e: - print(e) - print(path) - print(line) - raise e - return results - - -def save_jsonl(lines, directory): - with open(directory, 'w', encoding='utf8') as f: - for line in lines: - f.write(json.dumps(line, ensure_ascii=False) + '\n') - - -def extract_answer(js): - try: - if js is None or js == 'null': - return '' - answer = '' - if isinstance(js, str): - answer = js - elif 'text' in js['choices'][0]: - answer = js['choices'][0]['text'] - else: - answer = js['choices'][0]['message']['content'] - # answer = js[''] - return answer - except Exception as e: - # print(e) - # print(js) - 
return '' +# flake8: noqa +import json + + +def read_jsonl(path): + with open(path, encoding='utf8') as fh: + results = [] + for line in fh: + if line is None: + continue + try: + results.append(json.loads(line) if line != 'null' else line) + except Exception as e: + print(e) + print(path) + print(line) + raise e + return results + + +def save_jsonl(lines, directory): + with open(directory, 'w', encoding='utf8') as f: + for line in lines: + f.write(json.dumps(line, ensure_ascii=False) + '\n') + + +def extract_answer(js): + try: + if js is None or js == 'null': + return '' + answer = '' + if isinstance(js, str): + answer = js + elif 'text' in js['choices'][0]: + answer = js['choices'][0]['text'] + else: + answer = js['choices'][0]['message']['content'] + # answer = js[''] + return answer + except Exception as e: + # print(e) + # print(js) + return '' diff --git a/opencompass/datasets/subjective/mtbench101.py b/opencompass/datasets/subjective/mtbench101.py index a8109932..1e0acddf 100644 --- a/opencompass/datasets/subjective/mtbench101.py +++ b/opencompass/datasets/subjective/mtbench101.py @@ -1,385 +1,385 @@ -# flake8: noqa: E501 -import json -import os.path as osp -import re - -from datasets import Dataset, DatasetDict -from collections import defaultdict -from opencompass.utils import get_data_path - -from ..base import BaseDataset -from typing import Optional - -from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET - -from .utils import get_judgeanswer_and_reference -skip_first_tasks = ['FR', 'CR', 'AR', 'SA', 'SC', 'CM'] - -need_ref_tasks = ['MR', 'GR'] - -judge = "Please act as an impartial judge follow this instructions: In the following conversations, only the response of the 'assistant' in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated. Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria.\ - Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations is the groud truth history which do NOT need to be evaluated." - -score_format = "\n\n Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated!! You must provide your explanation. After providing your explanation, please show the score by strictly following this format: 'Rating: [[score]]', for example 'Rating: [[6]]'. The DIALGUE need to be judged is in this format: \n *** \n DIALGUE \n ***" - -eval_CM = "The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. 
The evaluation criteria are as follows:\n\ -\n\ -1.Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments.\n\ -2.Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue.\n\ -3.Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation.\n\ -4.Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements.\n\ -Scoring Guidelines:\n\ -\n\ -1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow.\n\ -4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context.\n\ -7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow.\n\ -10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally.\n\ -When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively. However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance.\n\ -\n\ -Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness." - -eval_SI = "\n We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows:\ -\n \ -1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task.\ -2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. 
The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements.\ -\n \ -Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines:\ -\n \ -- 1-3 points: The AI assistant failed to understand the ta///sk request and neither asked relevant questions nor provided information related to the task.\ -- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant.\ -- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights.\ -- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations.\ -\n \ -Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary." - -eval_CR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ -\n \ -1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ -2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ -\n \ -Scoring Guidelines:\n \ -\n \ -- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ -- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ -- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ -- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ -\n \ -Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly." - -eval_FR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ -\n \ -1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ -2. 
The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ -\n \ -Scoring Guidelines:\n \ -\n \ -- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ -- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ -- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ -- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ -\n \ -Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly." - -eval_CC = "\nThe AI assistant's capability to resist interference will be measured against these criteria:\ -\n\n \ -1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information.\n\ -2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only.\ -\n\n \ -Scoring Guidelines:\ -\n\n \ -- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information.\n\ -- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question.\n\ -- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions.\n\ -- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant.\ -\n\n \ -Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria. " - -eval_TS = "\nThe AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows:\n \ -\n \ -1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject.\n \ -2. Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds.\n \ -3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature.\n \ -4. 
Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads.\n \ -Scoring Guidelines:\n \ -\n \ -1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter.\n \ -4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts.\n \ -7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction.\n \ -10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content.\n \ -When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities.\n \ -\n \ -Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria." - -eval_AR = "The AI assistant's understanding of references is essential for maintaining a coherent dialogue. The following criteria should be used to evaluate its performance:\n\ -\n \ -1. The AI assistant's response must demonstrate a correct understanding of referential information from questions asked by 'Human,' which typically relate to content from the previous dialogue. Ideally, the AI should explicitly acknowledge or clarify these references in its reply.\n\ -2. 
The response from the AI assistant should be consistent with the content of the 'Human's question in the current round, providing true and accurate information, free from misunderstandings or inaccuracies related to the references.\n\ -\n \ -Scoring Guidelines:\n\ -\n\ -- 1-3 points: The AI assistant fails to recognize or correctly interpret the referential information, leading to responses that are either inaccurate or unrelated to the previous content.\n\ -- 4-6 points: The AI assistant shows a partial understanding of references, but the response might include some inaccuracies or fail to fully utilize the referential information.\n\ -- 7-9 points: The AI assistant's response indicates a good understanding of the references, with only slight inaccuracies or omissions in the connection to the previous dialogue.\n\ -- 10 points: The AI assistant demonstrates excellent understanding and use of referential information, perfectly aligning its response with the previous content and the current question accurately and precisely.\n\ -\n \ -In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above. " - -eval_IC = "The AI assistant’s ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows:\n \ -\n \ -1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification.\n \ -2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query.\n \ -3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query.\n \ -\n \ -Scoring Guidelines:\n \ -\n \ -- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question.\n \ -- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified.\n \ -- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question.\n \ -- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question.\n \ -\n \ -When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. 
If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue.\n \ -\n \ -Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query." - -eval_SC = "We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows:\n\ -\n\ -1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation.\n\ -2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response.\n\ -\n\ -Scoring Guidelines:\n\ -\n\ -- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response.\n\ -- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response.\n\ -- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer.\n\ -- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response.\n\ -\n\ -In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above." - -eval_SA = "We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows:\n\ -\n\ -1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge.\n\ -2. 
If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance.\n\ -\n\ -Scoring Guidelines:\n\ -\n\ -- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect.\n\ -- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer.\n\ -- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification.\n\ -- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge.\n\ -\n\ -In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above." - -eval_PI = "The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows:\n\ -\n\ -1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics.\n\ -2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts.\n\ -3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it.\n\ -4. 
Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'.\n\ -Scoring Guidelines:\n\ -\n\ -1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation.\n\ -4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'.\n\ -7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'.\n\ -10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation.\n\ -When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score.\n\ -\n\ -Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience." - -eval_MR = "The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows:\n\ -\n\ -1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem.\n\ -2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles.\n\ -3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ -4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ -Scoring Guidelines:\n\ -\n\ -1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards.\n\ -4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles.\n\ -7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ -10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding.\n\ -When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ -\n\ -Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution." - -eval_GR = "The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows:\n\ -\n\ -1. Verify the accuracy of the AI assistant's answer against the provided reference solution in format ‘### reference solution ###‘ for the specific problem.\n\ -2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning.\n\ -3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ -4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ -Scoring Guidelines:\n\ -\n\ -1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning.\n\ -4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles.\n\ -7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ -10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding.\n\ -When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ -\n\ -Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution." - -unique_prompt = { - 'CM': eval_CM, - 'SI': eval_SI, - 'AR': eval_AR, - 'TS': eval_TS, - 'CC': eval_CC, - 'CR': eval_CR, - 'FR': eval_FR, - 'SC': eval_SC, - 'SA': eval_SA, - 'MR': eval_MR, - 'GR': eval_GR, - 'IC': eval_IC, - 'PI': eval_PI, -} - - -def eval_prompt_construct(task, ref_answer, history): - - if task in need_ref_tasks: - system_prompt = judge + unique_prompt[task] + score_format - prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***\n\n\ - The reference solution is: \n ### \n {ref_answer} \n ###\n\n'.format( - history=history, prediction='{prediction}', ref_answer=ref_answer) - - else: - system_prompt = judge + unique_prompt[task] + score_format - prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***'.format( - history=history, prediction='{prediction}') - - return system_prompt, prompt_template - - -def add_format(question, answer): - history = [dict(role='user', content=question)] - if answer: - history += [dict(role='assistant', content=answer)] - return history - - -@LOAD_DATASET.register_module() -class MTBench101Dataset(BaseDataset): - - def load(self, path: str, name: str, *args, **kwargs): - import copy - - filename = osp.join(path, f'{name}.jsonl') - filename = get_data_path(filename, local_mode=True) - # filename = osp.join(path, 'mtbench101.jsonl') - dataset = DatasetDict() - raw_data = [] - - lines = open(filename, 'r', encoding='utf-8').readlines() - conversations = [] - for line in lines: - line = json.loads(line) - conversations.append(line) - - for dialogue in conversations: - multi_id = dialogue['id'] - task = dialogue['task'] - if task in skip_first_tasks: - skip_first = True - else: - skip_first = False - - current_multi_id = None - pre_dia = [] - history = '' - dia_list = [] - for turn_index, turn in 
enumerate(dialogue['history']): - human = turn['user'] - assistant = turn['bot'] - turn_id = str(turn_index + 1) - - if current_multi_id is not None and multi_id != current_multi_id: - pre_dia = [] - history = '' - - current_multi_id = multi_id - - if skip_first and turn_index == 0: - pre_dia = add_format(question=human, answer=assistant) - history = '\n\n Human: ' + human + '\n\nAssistant: ' + assistant - continue - - history = history + '\n\n Human: ' + human + '\n\nAssistant: ' - pre_dia += add_format(question=human, answer=assistant) - - pre_dia_copy = copy.deepcopy(pre_dia) - - system_prompt, prompt_template = eval_prompt_construct( - task, pre_dia, history) - - raw_data.append({ - 'dialogue': pre_dia_copy, - 'task': task, - 'multi_id': current_multi_id, - 'turn_id': turn_id, - 'system_prompt': system_prompt, - 'prompt_template': prompt_template, - 'judge': { - 'task': task, - 'multi_id': current_multi_id, - 'turn_id': turn_id, - } - }) - history = history + assistant - - dataset = Dataset.from_list(raw_data) - return dataset - - -def post_process_mtbench101(judgement: str): - """Input a string like below: - - xxx[[5]]xxx, and extract the score - """ - judgement = judgement['prediction'] - match = re.search(r'\[([0-9]+)\]', judgement) - if match: - score = int(match.group(1)) - - else: - return None - - return {'score': score, 'judgement': judgement} - - -def get_final_results(judged_answers, references): - - task_multi_id_scores = defaultdict(list) - task_scores = defaultdict(list) - - for ans, ref in zip(judged_answers, references): - - task = ref['task'] - multi_id = ref['multi_id'] - score = ans['score'] - - task_multi_id_scores[(task, multi_id)].append(score) - - for (task, multi_id), scores in task_multi_id_scores.items(): - min_score = min(scores) - task_scores[task].append(min_score) - - final_task_scores = { - task: sum(scores) / len(scores) if scores else 0 - for task, scores in task_scores.items() - } - average_score = round( - sum(final_task_scores.values()) / len(final_task_scores), 3) - - return {f'avg': average_score, **final_task_scores} - - -@DICT_POSTPROCESSORS.register_module('mtbench101') -def mtbench101_postprocess(output: dict, - output_path: str, - ) -> dict: - judged_answers, references = get_judgeanswer_and_reference( - output, output_path, post_process_mtbench101) - - results = get_final_results(judged_answers, references) - results['details'] = output +# flake8: noqa: E501 +import json +import os.path as osp +import re + +from datasets import Dataset, DatasetDict +from collections import defaultdict +from opencompass.utils import get_data_path + +from ..base import BaseDataset +from typing import Optional + +from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET + +from .utils import get_judgeanswer_and_reference +skip_first_tasks = ['FR', 'CR', 'AR', 'SA', 'SC', 'CM'] + +need_ref_tasks = ['MR', 'GR'] + +judge = "Please act as an impartial judge follow this instructions: In the following conversations, only the response of the 'assistant' in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated. 
Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria.\ + Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations are the ground truth history and do NOT need to be evaluated." + +score_format = "\n\n Note that only the response of the 'assistant' in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated!! You must provide your explanation. After providing your explanation, please show the score by strictly following this format: 'Rating: [[score]]', for example 'Rating: [[6]]'. The DIALOGUE to be judged is in this format: \n *** \n DIALOGUE \n ***" + +eval_CM = "The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. The evaluation criteria are as follows:\n\ +\n\ +1. Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments.\n\ +2. Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue.\n\ +3. Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation.\n\ +4. Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow.\n\ +4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context.\n\ +7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow.\n\ +10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally.\n\ +When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively. 
However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance.\n\ +\n\ +Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness." + +eval_SI = "\n We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows:\ +\n \ +1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task.\ +2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements.\ +\n \ +Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines:\ +\n \ +- 1-3 points: The AI assistant failed to understand the task request and neither asked relevant questions nor provided information related to the task.\ +- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant.\ +- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights.\ +- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations.\ +\n \ +Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary." + +eval_CR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ +\n \ +1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ +2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ +- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ +- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ +- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ +\n \ +Please provide a brief justification for the score you give and present your score. 
Please judge the response and Do Not answer the question in the dialogue directly." + +eval_FR = "\nWe aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:\n \ +\n \ +1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.\n \ +2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.\n \ +- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.\n \ +- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.\n \ +- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.\n \ +\n \ +Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly." + +eval_CC = "\nThe AI assistant's capability to resist interference will be measured against these criteria:\ +\n\n \ +1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information.\n\ +2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only.\ +\n\n \ +Scoring Guidelines:\ +\n\n \ +- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information.\n\ +- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question.\n\ +- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions.\n\ +- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant.\ +\n\n \ +Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria. " + +eval_TS = "\nThe AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows:\n \ +\n \ +1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject.\n \ +2. 
Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds.\n \ +3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature.\n \ +4. Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads.\n \ +Scoring Guidelines:\n \ +\n \ +1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter.\n \ +4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts.\n \ +7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction.\n \ +10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content.\n \ +When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities.\n \ +\n \ +Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria." + +eval_AR = "The AI assistant's understanding of references is essential for maintaining a coherent dialogue. The following criteria should be used to evaluate its performance:\n\ +\n \ +1. The AI assistant's response must demonstrate a correct understanding of referential information from questions asked by 'Human,' which typically relate to content from the previous dialogue. Ideally, the AI should explicitly acknowledge or clarify these references in its reply.\n\ +2. 
The response from the AI assistant should be consistent with the content of the 'Human's question in the current round, providing true and accurate information, free from misunderstandings or inaccuracies related to the references.\n\ +\n \ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant fails to recognize or correctly interpret the referential information, leading to responses that are either inaccurate or unrelated to the previous content.\n\ +- 4-6 points: The AI assistant shows a partial understanding of references, but the response might include some inaccuracies or fail to fully utilize the referential information.\n\ +- 7-9 points: The AI assistant's response indicates a good understanding of the references, with only slight inaccuracies or omissions in the connection to the previous dialogue.\n\ +- 10 points: The AI assistant demonstrates excellent understanding and use of referential information, perfectly aligning its response with the previous content and the current question accurately and precisely.\n\ +\n \ +In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above. " + +eval_IC = "The AI assistant’s ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows:\n \ +\n \ +1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification.\n \ +2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query.\n \ +3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query.\n \ +\n \ +Scoring Guidelines:\n \ +\n \ +- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question.\n \ +- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified.\n \ +- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question.\n \ +- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question.\n \ +\n \ +When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. 
If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue.\n \ +\n \ +Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query." + +eval_SC = "We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows:\n\ +\n\ +1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation.\n\ +2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response.\n\ +\n\ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response.\n\ +- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response.\n\ +- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer.\n\ +- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response.\n\ +\n\ +In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above." + +eval_SA = "We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows:\n\ +\n\ +1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge.\n\ +2. 
If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance.\n\ +\n\ +Scoring Guidelines:\n\ +\n\ +- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect.\n\ +- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer.\n\ +- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification.\n\ +- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge.\n\ +\n\ +In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above." + +eval_PI = "The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows:\n\ +\n\ +1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics.\n\ +2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts.\n\ +3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it.\n\ +4. 
Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation.\n\ +4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'.\n\ +7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'.\n\ +10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation.\n\ +When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience." + +eval_MR = "The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows:\n\ +\n\ +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem.\n\ +2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles.\n\ +3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ +4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards.\n\ +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles.\n\ +7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding.\n\ +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution." + +eval_GR = "The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows:\n\ +\n\ +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in format ‘### reference solution ###‘ for the specific problem.\n\ +2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning.\n\ +3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself.\n\ +4. 
Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.\n\ +Scoring Guidelines:\n\ +\n\ +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning.\n\ +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles.\n\ +7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.\n\ +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding.\n\ +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.\n\ +\n\ +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution." + +unique_prompt = { + 'CM': eval_CM, + 'SI': eval_SI, + 'AR': eval_AR, + 'TS': eval_TS, + 'CC': eval_CC, + 'CR': eval_CR, + 'FR': eval_FR, + 'SC': eval_SC, + 'SA': eval_SA, + 'MR': eval_MR, + 'GR': eval_GR, + 'IC': eval_IC, + 'PI': eval_PI, +} + + +def eval_prompt_construct(task, ref_answer, history): + + if task in need_ref_tasks: + system_prompt = judge + unique_prompt[task] + score_format + prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***\n\n\ + The reference solution is: \n ### \n {ref_answer} \n ###\n\n'.format( + history=history, prediction='{prediction}', ref_answer=ref_answer) + + else: + system_prompt = judge + unique_prompt[task] + score_format + prompt_template = 'The dialogue need to be judged is: \n *** \n {history} {prediction} \n ***'.format( + history=history, prediction='{prediction}') + + return system_prompt, prompt_template + + +def add_format(question, answer): + history = [dict(role='user', content=question)] + if answer: + history += [dict(role='assistant', content=answer)] + return history + + +@LOAD_DATASET.register_module() +class MTBench101Dataset(BaseDataset): + + def load(self, path: str, name: str, *args, **kwargs): + import copy + + filename = osp.join(path, f'{name}.jsonl') + filename = get_data_path(filename, local_mode=True) + # filename = osp.join(path, 'mtbench101.jsonl') + dataset = DatasetDict() + raw_data = [] + + lines = open(filename, 'r', encoding='utf-8').readlines() + conversations = [] + for line in lines: + line = json.loads(line) + conversations.append(line) + + for dialogue in conversations: + multi_id = dialogue['id'] + task = dialogue['task'] + if task in skip_first_tasks: + skip_first = True + else: + skip_first = False + + current_multi_id = None + pre_dia = [] + history = '' + dia_list = [] + for turn_index, turn in 
enumerate(dialogue['history']): + human = turn['user'] + assistant = turn['bot'] + turn_id = str(turn_index + 1) + + if current_multi_id is not None and multi_id != current_multi_id: + pre_dia = [] + history = '' + + current_multi_id = multi_id + + if skip_first and turn_index == 0: + pre_dia = add_format(question=human, answer=assistant) + history = '\n\n Human: ' + human + '\n\nAssistant: ' + assistant + continue + + history = history + '\n\n Human: ' + human + '\n\nAssistant: ' + pre_dia += add_format(question=human, answer=assistant) + + pre_dia_copy = copy.deepcopy(pre_dia) + + system_prompt, prompt_template = eval_prompt_construct( + task, pre_dia, history) + + raw_data.append({ + 'dialogue': pre_dia_copy, + 'task': task, + 'multi_id': current_multi_id, + 'turn_id': turn_id, + 'system_prompt': system_prompt, + 'prompt_template': prompt_template, + 'judge': { + 'task': task, + 'multi_id': current_multi_id, + 'turn_id': turn_id, + } + }) + history = history + assistant + + dataset = Dataset.from_list(raw_data) + return dataset + + +def post_process_mtbench101(judgement: str): + """Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + judgement = judgement['prediction'] + match = re.search(r'\[([0-9]+)\]', judgement) + if match: + score = int(match.group(1)) + + else: + return None + + return {'score': score, 'judgement': judgement} + + +def get_final_results(judged_answers, references): + + task_multi_id_scores = defaultdict(list) + task_scores = defaultdict(list) + + for ans, ref in zip(judged_answers, references): + + task = ref['task'] + multi_id = ref['multi_id'] + score = ans['score'] + + task_multi_id_scores[(task, multi_id)].append(score) + + for (task, multi_id), scores in task_multi_id_scores.items(): + min_score = min(scores) + task_scores[task].append(min_score) + + final_task_scores = { + task: sum(scores) / len(scores) if scores else 0 + for task, scores in task_scores.items() + } + average_score = round( + sum(final_task_scores.values()) / len(final_task_scores), 3) + + return {f'avg': average_score, **final_task_scores} + + +@DICT_POSTPROCESSORS.register_module('mtbench101') +def mtbench101_postprocess(output: dict, + output_path: str, + ) -> dict: + judged_answers, references = get_judgeanswer_and_reference( + output, output_path, post_process_mtbench101) + + results = get_final_results(judged_answers, references) + results['details'] = output return results \ No newline at end of file diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml index 9cbb006b..dc47b9da 100644 --- a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml @@ -14,4 +14,4 @@ max_tokens: 4096 temperatrue: 0.0 max_rounds: 30 -BoN: 32 \ No newline at end of file +BoN: 32 diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml index c1fd105c..f380c412 100644 --- a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml @@ -14,4 +14,4 @@ max_tokens: 32768 temperatrue: 0.0 max_rounds: 30 -BoN: 32 \ No newline at end of file +BoN: 32 diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml 
b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml index 6cffdd93..38632ac2 100644 --- a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml @@ -20,4 +20,4 @@ initial_prompt_3: - | Review the following multiple-choice question and choose the one correct answer. Ensure that your response concludes with a line exactly formatted as 'The correct answer is: $LETTER', where LETTER represents one of A, B, C, D, E, F, G, H, I, or J. - {} \ No newline at end of file + {} diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml index 991c6a4d..20e79daf 100644 --- a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml @@ -2,4 +2,4 @@ prompt_format: - | Answer the following multiple choice question about {}. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. - {} \ No newline at end of file + {} diff --git a/opencompass/datasets/teval/__init__.py b/opencompass/datasets/teval/__init__.py index 631a9ae5..ea0117dd 100644 --- a/opencompass/datasets/teval/__init__.py +++ b/opencompass/datasets/teval/__init__.py @@ -1,60 +1,60 @@ -import json -import os.path as osp -from typing import Dict, Optional - -import mmengine -from datasets import Dataset, DatasetDict - -from opencompass.registry import TEXT_POSTPROCESSORS -from opencompass.utils import get_data_path - -from ..base import BaseDataset - - -class TEvalDataset(BaseDataset): - - def __init__(self, reader_cfg: Optional[Dict] = {}, **kwargs): - super().__init__(reader_cfg=reader_cfg, **kwargs) - - def load(self, path: str, name: str): - path = get_data_path(path, local_mode=True) - - dataset = DatasetDict() - data = mmengine.load(osp.join(path, f'{name}.json')) - raw_data = [] - for i in data.keys(): - origin_prompt = data[i]['origin_prompt'] - if isinstance(origin_prompt, str): - origin_prompt = json.loads(origin_prompt) - # Aligning the default roles of opencompass - prompt = origin_prompt + [ - dict(role='assistant', - content=str(data[i].get('ground_truth'))) - ] - raw_data.append({ - 'prompt': prompt, - 'ground_truth': json.dumps(data[i]) - }) - dataset['test'] = Dataset.from_list(raw_data) - dataset['train'] = Dataset.from_list(raw_data) - return dataset - - - - -@TEXT_POSTPROCESSORS.register_module('teval') -def teval_postprocess(text: str) -> str: - if isinstance(text, str): - text = text.split('')[0] - text = text.split('')[0] - text = text.split('<|im_end|>')[0] - text = text.split('\nuser')[0] - text = text.split('\nUSER')[0] - text = text.split('[INST]')[0] - text = text.strip() - if text.startswith('```json'): - text = text[len('```json'):] - text = text.strip('`').strip() - if text[:2] == '{{' and text[-2:] == '}}': - text = text[1:-1] - return str(text) +import json +import os.path as osp +from typing import Dict, Optional + +import mmengine +from datasets import Dataset, DatasetDict + +from opencompass.registry import TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +from ..base import BaseDataset + + +class TEvalDataset(BaseDataset): + + def __init__(self, reader_cfg: 
Optional[Dict] = {}, **kwargs): + super().__init__(reader_cfg=reader_cfg, **kwargs) + + def load(self, path: str, name: str): + path = get_data_path(path, local_mode=True) + + dataset = DatasetDict() + data = mmengine.load(osp.join(path, f'{name}.json')) + raw_data = [] + for i in data.keys(): + origin_prompt = data[i]['origin_prompt'] + if isinstance(origin_prompt, str): + origin_prompt = json.loads(origin_prompt) + # Aligning the default roles of opencompass + prompt = origin_prompt + [ + dict(role='assistant', + content=str(data[i].get('ground_truth'))) + ] + raw_data.append({ + 'prompt': prompt, + 'ground_truth': json.dumps(data[i]) + }) + dataset['test'] = Dataset.from_list(raw_data) + dataset['train'] = Dataset.from_list(raw_data) + return dataset + + + + +@TEXT_POSTPROCESSORS.register_module('teval') +def teval_postprocess(text: str) -> str: + if isinstance(text, str): + text = text.split('')[0] + text = text.split('')[0] + text = text.split('<|im_end|>')[0] + text = text.split('\nuser')[0] + text = text.split('\nUSER')[0] + text = text.split('[INST]')[0] + text = text.strip() + if text.startswith('```json'): + text = text[len('```json'):] + text = text.strip('`').strip() + if text[:2] == '{{' and text[-2:] == '}}': + text = text[1:-1] + return str(text) diff --git a/opencompass/datasets/teval/evaluators/__init__.py b/opencompass/datasets/teval/evaluators/__init__.py index 8f222150..56189ca4 100644 --- a/opencompass/datasets/teval/evaluators/__init__.py +++ b/opencompass/datasets/teval/evaluators/__init__.py @@ -1,5 +1,5 @@ -from .instruct_evaluator import InstructEvaluator -from .planning_evaluator import PlanningEvaluator -from .review_evaluator import ReviewEvaluator -from .reason_retrieve_understand_evaluator import ReasonRetrieveUnderstandEvaluator -__all__ = ['InstructEvaluator', 'PlanningEvaluator', 'ReviewEvaluator', 'ReasonRetrieveUnderstandEvaluator'] +from .instruct_evaluator import InstructEvaluator +from .planning_evaluator import PlanningEvaluator +from .review_evaluator import ReviewEvaluator +from .reason_retrieve_understand_evaluator import ReasonRetrieveUnderstandEvaluator +__all__ = ['InstructEvaluator', 'PlanningEvaluator', 'ReviewEvaluator', 'ReasonRetrieveUnderstandEvaluator'] diff --git a/opencompass/datasets/teval/evaluators/instruct_evaluator.py b/opencompass/datasets/teval/evaluators/instruct_evaluator.py index d90629c4..fecf9e97 100644 --- a/opencompass/datasets/teval/evaluators/instruct_evaluator.py +++ b/opencompass/datasets/teval/evaluators/instruct_evaluator.py @@ -1,152 +1,152 @@ -from collections import defaultdict -from mmengine import load - -from ..utils.template import parse_string -from ..utils.format_load import format_load -from ..schema import ResponseDataSample -import ast -import numpy as np - -class InstructEvaluator: - """Instruct Following Evaluation - - Args: - dataset_path(str): File path of evaluation dataset. - - """ - - def __init__( - self, - dataset_path: str, - **kwargs, - ) -> None: - self.dataset_path = dataset_path - - def _load_dataset(self): - self.dataset = [] - dataset = load(self.dataset_path) - - for key in dataset.keys(): - datum = dataset[key] - data_sample = self._process_response(datum) - - self.dataset.append( - dict( - origin_prompt=datum["origin_prompt"], - response_data_sample=data_sample)) - self.num_samples = len(self.dataset) - - def _process_response( - self, - datum: dict, - ) -> ResponseDataSample: - """Process the response to needed format. - - Args: - datum(dict): inputs. 
- - Returns: - dict: Processed response data sample. - """ - - # Dict with keyword-only arguments. - template = datum['template'] - # Generated response. - pred_data = datum['prediction'] - # Response of ground truth. - gt_data = datum['ground_truth'] - meta_data = datum['meta_data'] - - return ResponseDataSample( - template=template, pred=pred_data, gt=gt_data, meta_data=meta_data) - - def _evaluate(self, data_sample: dict) -> dict: - metrics_result = dict() - response_format = data_sample.meta_data['response_format'] - if response_format == 'json': - pred_data = self.json_format_parse(data_sample) - else: - pred_data = self.string_format_parse(data_sample) - - if pred_data is None: - # directly set to 0 for all metrics - metrics_result[f'{response_format}_format_metric'] = 0 - metrics_result[f'{response_format}_args_em_metric'] = 0 - return metrics_result - - # Exact matching - metrics_result[f'{response_format}_format_metric'] = 1 - metrics_result[f'{response_format}_args_em_metric'] = self.compute_args_em_metric( - gt_action=data_sample.gt['action'], pred_action=pred_data['action'], - gt_args=data_sample.gt['args'], pred_args=pred_data['args'] - ) - return metrics_result - - def compute_args_em_metric(self, gt_action, pred_action, gt_args, pred_args): - cnt = 0. - if gt_action == pred_action: - cnt += 1. - num_args = len(gt_args) + 1 # 1 means action name match - for gt_key in gt_args: - pred_val = pred_args.get(gt_key, "") - if pred_val == gt_args[gt_key]: - cnt += 1. - return cnt / num_args - - def string_format_parse(self, data_sample): - pred_data = data_sample.pred - template = data_sample.template - thought_start = template['thought_start'] - thought_end = template['thought_end'] - action_start = template['action_start'] - action_end = template['action_end'] - args_start = template['args_start'] - args_end = template['args_end'] - - parse_template = thought_start + "{thought}" + thought_end \ - + action_start + "{action}" + action_end \ - + args_start + "{args}" + args_end - res = parse_string(parse_template, pred_data, allow_newline=True) - try: - if res is not None: - args = ast.literal_eval(res['args'].strip()) - res['args'] = args if isinstance(args, dict) else {} - res['action'] = res['action'].strip() - return res - except: - return dict(thought=res['thought'], action=res['action'].strip(), args=dict()) - - def json_format_parse(self, data_sample): - try: - pred_data = format_load(data_sample.pred) - template = data_sample.template - new_data = dict() - new_data['thought'] = pred_data[template['thought']] - new_data['action'] = pred_data[template['action']] - args = pred_data[template['args']] - new_data['args'] = args if isinstance(args, dict) else {} - except Exception as e: - return None - - return new_data - - def evaluate(self): - self._load_dataset() - results_list = [] - for data_sample in self.dataset: - metrics_result = self._evaluate(data_sample['response_data_sample']) - results_list.append(metrics_result) - return self._post_process(results_list) - - def _post_process(self, results_list): - # list of dict to dict of list - results_dict = defaultdict(list) - { - results_dict[key].append(sub[key]) - for sub in results_list for key in sub - } - metric_list = ['json_format_metric', 'json_args_em_metric', - 'string_format_metric', 'string_args_em_metric'] - for metric in metric_list: - results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4) - return results_dict +from collections import defaultdict +from mmengine import load + +from 
..utils.template import parse_string +from ..utils.format_load import format_load +from ..schema import ResponseDataSample +import ast +import numpy as np + +class InstructEvaluator: + """Instruct Following Evaluation + + Args: + dataset_path(str): File path of evaluation dataset. + + """ + + def __init__( + self, + dataset_path: str, + **kwargs, + ) -> None: + self.dataset_path = dataset_path + + def _load_dataset(self): + self.dataset = [] + dataset = load(self.dataset_path) + + for key in dataset.keys(): + datum = dataset[key] + data_sample = self._process_response(datum) + + self.dataset.append( + dict( + origin_prompt=datum["origin_prompt"], + response_data_sample=data_sample)) + self.num_samples = len(self.dataset) + + def _process_response( + self, + datum: dict, + ) -> ResponseDataSample: + """Process the response to needed format. + + Args: + datum(dict): inputs. + + Returns: + dict: Processed response data sample. + """ + + # Dict with keyword-only arguments. + template = datum['template'] + # Generated response. + pred_data = datum['prediction'] + # Response of ground truth. + gt_data = datum['ground_truth'] + meta_data = datum['meta_data'] + + return ResponseDataSample( + template=template, pred=pred_data, gt=gt_data, meta_data=meta_data) + + def _evaluate(self, data_sample: dict) -> dict: + metrics_result = dict() + response_format = data_sample.meta_data['response_format'] + if response_format == 'json': + pred_data = self.json_format_parse(data_sample) + else: + pred_data = self.string_format_parse(data_sample) + + if pred_data is None: + # directly set to 0 for all metrics + metrics_result[f'{response_format}_format_metric'] = 0 + metrics_result[f'{response_format}_args_em_metric'] = 0 + return metrics_result + + # Exact matching + metrics_result[f'{response_format}_format_metric'] = 1 + metrics_result[f'{response_format}_args_em_metric'] = self.compute_args_em_metric( + gt_action=data_sample.gt['action'], pred_action=pred_data['action'], + gt_args=data_sample.gt['args'], pred_args=pred_data['args'] + ) + return metrics_result + + def compute_args_em_metric(self, gt_action, pred_action, gt_args, pred_args): + cnt = 0. + if gt_action == pred_action: + cnt += 1. + num_args = len(gt_args) + 1 # 1 means action name match + for gt_key in gt_args: + pred_val = pred_args.get(gt_key, "") + if pred_val == gt_args[gt_key]: + cnt += 1. 
+ return cnt / num_args + + def string_format_parse(self, data_sample): + pred_data = data_sample.pred + template = data_sample.template + thought_start = template['thought_start'] + thought_end = template['thought_end'] + action_start = template['action_start'] + action_end = template['action_end'] + args_start = template['args_start'] + args_end = template['args_end'] + + parse_template = thought_start + "{thought}" + thought_end \ + + action_start + "{action}" + action_end \ + + args_start + "{args}" + args_end + res = parse_string(parse_template, pred_data, allow_newline=True) + try: + if res is not None: + args = ast.literal_eval(res['args'].strip()) + res['args'] = args if isinstance(args, dict) else {} + res['action'] = res['action'].strip() + return res + except: + return dict(thought=res['thought'], action=res['action'].strip(), args=dict()) + + def json_format_parse(self, data_sample): + try: + pred_data = format_load(data_sample.pred) + template = data_sample.template + new_data = dict() + new_data['thought'] = pred_data[template['thought']] + new_data['action'] = pred_data[template['action']] + args = pred_data[template['args']] + new_data['args'] = args if isinstance(args, dict) else {} + except Exception as e: + return None + + return new_data + + def evaluate(self): + self._load_dataset() + results_list = [] + for data_sample in self.dataset: + metrics_result = self._evaluate(data_sample['response_data_sample']) + results_list.append(metrics_result) + return self._post_process(results_list) + + def _post_process(self, results_list): + # list of dict to dict of list + results_dict = defaultdict(list) + { + results_dict[key].append(sub[key]) + for sub in results_list for key in sub + } + metric_list = ['json_format_metric', 'json_args_em_metric', + 'string_format_metric', 'string_args_em_metric'] + for metric in metric_list: + results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4) + return results_dict diff --git a/opencompass/datasets/teval/evaluators/planning_evaluator.py b/opencompass/datasets/teval/evaluators/planning_evaluator.py index 59056de5..96f3770b 100644 --- a/opencompass/datasets/teval/evaluators/planning_evaluator.py +++ b/opencompass/datasets/teval/evaluators/planning_evaluator.py @@ -1,394 +1,394 @@ -from numpy import mean -from mmengine import load -from ..utils.format_load import format_load -import itertools -import networkx as nx -import numpy as np -import copy -import re -from tqdm import tqdm - -from ..schema import ResponseDataSample -from sentence_transformers import SentenceTransformer, util - - -class PlanningEvaluator: - """Planning Evaluation - Args: - dataset_path(str): File path of evaluation dataset - name_weight(float): the weight of action_name in bert_score match, default = 0.9 - args_weight(float): the weight of action_args in bert_score match, default = 0.1 - match_threshold(float): the threshold of matching - match_strategy(str): matching method, can choose 'bertscore' or 'permutation' - bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". - Refer to https://www.sbert.net/docs/pretrained_models.html for more models. 
- """ - def __init__( - self, - dataset_path: str, - name_weight = 0.75, - args_weight = 0.25, - match_threshold = 0.7, - match_strategy: str = 'bertscore', # ["bertscore", "permutation"] - bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2'] - default_prompt_type: str = 'json', # ["json", "ReWOO"] - **kwargs, - ) -> None: - self.bert_score_model = bert_score_model - print(bert_score_model) - self.dataset_path = dataset_path - self.name_weight = name_weight - self.args_weight = args_weight - self.match_threshold = match_threshold - self.default_prompt_type = default_prompt_type # ["json", "ReWOO"] - assert match_strategy in ["bertscore", "permutation"], f"match strategy must in [\"bertscore\", \"permutation\"], but get {match_strategy}" - self.match_strategy = match_strategy - self.valid_data_count = None - self.sentence_model = SentenceTransformer(self.bert_score_model) - - def _load_dataset(self): - self.dataset = [] - dataset = load(self.dataset_path) - total_error = 0 - total_count = 0 - for key in dataset.keys(): - datum = dataset[key] - data_sample, error = self._process_response(datum) - total_error += error - total_count += 1 - self.dataset.append( - dict(response_data_sample=data_sample)) - - self.num_samples = len(self.dataset) - print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) - self.valid_data_count = total_count - total_error - - def format_load(self, data): - r''' - ensure evaluator can work correctly under any data input - ''' - try: - json_format = format_load(data, start_character='[', end_character=']') - except Exception as e: - return [] - if type(json_format) != list: - return [] - for i in range(len(json_format)): - try: - json_format[i] = { - 'name': str(json_format[i]['name']), - 'id': int(json_format[i]['id']), - 'args': str(json_format[i]['args']) - } - except Exception as e: - return [] - return json_format - - def _process_response( - self, - datum, - ) -> ResponseDataSample: - """Process the response to needed format. - Args: - datum(dict): inputs. - Returns: - dict: Processed response data sample. - """ - - # Generated response, which can be a string or list - pred_data = datum['prediction'] - # Response of ground truth, which can be a string or list - gt_data = datum['ground_truth'] - # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" - if "meta" in datum: - prompt_type = datum["meta"].get("prompt_type", self.default_prompt_type) - else: - prompt_type = self.default_prompt_type - - error = 0 - pred = dict() - gt = dict() - gt['planning'] = self.format_load(gt_data) - if prompt_type == 'json': - pred['planning'] = self.format_load(pred_data) - if pred['planning'] == [] or gt['planning'] == []: - error = 1 - - elif prompt_type == 'ReWOO': - """ - This type is deprecated - The planning prediction data should in this format: - Plan 1: description about the first action - Dependency 1: the first action depends on which previous actions - Action 1: #E1 = api_name1(args1) - ... - Which will be passed only if "number of plan lines == number of dependency lines == number of action lines" - The passed data's format is: - [ - dict( - id = i, - name = curr_name, - args = args_str - ) - ... - ] - - The golden answer prediction is a json that is the same as the json format. 
- """ - thoughts = re.findall(r'(Plan [0-9]+: .+)', pred_data) - dependencies = re.findall(r'(Dependency [0-9]+: .+)', pred_data) - action_units = re.findall(r'Action [0-9]+: (.+)', pred_data) - - if not (len(thoughts) == len(dependencies) and len(thoughts) == len(action_units)): - pred['planning'] = [] - gt['planning'] = [] - return ResponseDataSample(template = '', pred=pred, gt=gt), 1 - - plan_action = [] - for i in range(len(action_units)): - dependency_list = re.findall(r'Dependency [0-9]+: (.+)', dependencies[i]) - if action_units[i][0] == '#': - # The action has a return #E - args_str_list = re.findall(r'#E[0-9]+ = .+\((.+)\)', action_units[i]) - name_list = re.findall(r'#E[0-9]+ = (.+)\(', action_units[i]) - else: - # The action does not have a return - args_str_list = re.findall(r'.+\((.+)\)', action_units[i]) - name_list = re.findall(r'(.+)\(', action_units[i]) - if (len(name_list) > 0): - curr_name = name_list[0] - else: - curr_name = "" - if (len(args_str_list) > 0): - args_str = "{" + args_str_list[0] + "}" - else: - args_str = "{}" - if (len(dependency_list) > 0): - dependency_str = dependency_list[0] - else: - dependency_str = "" - dependency = re.findall('([0-9]+)', dependency_str) - dependency = list(set([int(x) - 1 for x in dependency])) - plan_action.append( - dict( - id = i, - name = curr_name, - prev = dependency, - args = args_str - )) - pred['planning'] = plan_action - #Turn dict into args str - for i in range(len(gt['planning'])): - args_str = "" - if type(gt['planning'][i]['args']) == str: - args_dict = eval(gt['planning'][i]['args']) - else: - assert type(gt['planning'][i]['args']) == dict - args_dict = gt['planning'][i]['args'] - for it in args_dict: - if args_str == "": args_str += f"{it}=\"{args_dict[it]}\"" - else: args_str += f", {it}=\"{args_dict[it]}\"" - gt['planning'][i]['args'] = '{' + args_str + '}' - - elif prompt_type == 'str': - pred_data_format = pred_data.replace('. 
', '\n').split('\n') - pred_actions = [] - for pred_step in pred_data_format: - first_occur_time = 1e9 - pred_action = "" - for api_name in datum['meta']['API_list']: - occur_time = pred_step.find(api_name) - if occur_time != -1 and occur_time < first_occur_time: - first_occur_time = occur_time - pred_action = api_name - if pred_action != "": - pred_actions.append({ - 'id': len(pred_actions), - 'name': pred_action, - 'args': pred_step - }) - pred['planning'] = pred_actions - if len(pred['planning']) == 0: - error = 1 - else: - raise NotImplementedError(f"Currently, we only support json and ReWOO format, but get {prompt_type}") - - return ResponseDataSample(template = '', pred=pred, gt=gt), error - - def _evaluate(self, data_sample) -> dict: - if self.match_strategy == 'bertscore': - metrics_result = self.bertscore_match( - data_sample.pred['planning'], data_sample.gt['planning']) - elif self.match_strategy == 'permutation': - metrics_result = self.permutation_match( - data_sample.pred['planning'], data_sample.gt['planning']) - else: - raise NotImplementedError - if len(data_sample.pred['planning']) == 0 or len(data_sample.gt['planning']) == 0: - metrics_result['parse_rate'] = 0 - else: - metrics_result['parse_rate'] = 1 - return metrics_result - - def evaluate(self): - self._load_dataset() - results_list = [] - for data_sample in tqdm(self.dataset): - metrics_result = self._evaluate( - data_sample['response_data_sample']) - results_list.append(metrics_result) - return self._post_process(results_list) - - def permutation_match(self, pred_plan, gt_plan) -> dict: - ''' - The function calculates all the permutation matches' score and selects the max f1_score; - Since permutation is time consuming, we truncate the length of plans to 9 - ''' - if pred_plan[-1]['name'] != 'FinishAction': - pred_plan.append( - {'id': len(pred_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'} - ) - - if gt_plan[-1]['name'] != 'FinishAction': - gt_plan.append( - {'id': len(gt_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'} - ) - - # truncate plans to 9 since it is too long for permutation. 
- if len(pred_plan) > 9: pred_plan = pred_plan[:9] - if len(gt_plan) > 9: gt_plan = pred_plan[:9] - - pred_plan = sorted(pred_plan, key=lambda x: x['id']) - gt_plan = sorted(gt_plan, key=lambda x: x['id']) - len_pred = len(pred_plan) - len_gt = len(gt_plan) - map_id_max = max(len_pred, len_gt) - numbers = [i for i in range(map_id_max)] - perms = itertools.permutations(numbers, len_pred) - gt_prev_count, pred_prev_count = 0, 0 - for i in range(len_gt): - gt_plan[i]['prev'].append(i) - gt_prev_count += len(gt_plan[i]['prev']) - for i in range(len_pred): - pred_plan[i]['prev'].append(i) - pred_prev_count += len(pred_plan[i]['prev']) - if gt_prev_count == 0 or pred_prev_count == 0: - return { - 'precision': 0, - 'recall': 0, - 'f1_score': 0 - } - max_recall, max_precision, max_f1 = 0, 0, 0 - for perm in perms: - correct_count = 0 - for i in range(len_pred): - if perm[i] >= len_gt: - continue - for j in pred_plan[i]['prev']: - if perm[j] in gt_plan[perm[i]]['prev']: - correct_count += 1 - now_recall, now_precision = correct_count / gt_prev_count, correct_count / pred_prev_count - if now_recall + now_precision == 0: - continue - now_f1 = 2 * now_recall * now_precision / (now_recall + now_precision) - if now_f1 > max_f1: - max_f1, max_recall, max_precision = now_f1, now_recall, now_precision - return { - 'precision': max_precision, - 'recall': max_recall, - 'f1_score': max_f1 - } - - def bertscore_match(self, pred_plan, gt_plan) -> dict: - """ - Calculate the similarity between predicted plan and golden answer, - A plan can be regarded a sequence of actions, and each action has a name and args. - Firstly, use bertscore to calculate pointwise similarity by: - similarity(u, v) = bertscore(u.name, v.name) * name_weight + bertscore(u.args, v.args) * args_weight; - Secondly, use Hungarian matching to match the points; - Finally, use LIS to calculate the number of matched nodes. - """ - if len(pred_plan) == 0 or len(gt_plan) == 0: - return { - 'precision': 0, - 'recall': 0, - 'f1_score': 0 - } - - pred_plan = copy.deepcopy(sorted(pred_plan, key=lambda x: x['id'])) - gt_plan = copy.deepcopy(sorted(gt_plan, key=lambda x: x['id'])) - - #Add end action - #Currently it is hard-code - if pred_plan[-1]['name'] == 'FinishAction': - pred_plan = pred_plan[:-1] - if gt_plan[-1]['name'] == 'FinishAction': - gt_plan = gt_plan[:-1] - #The total counts of nodes and edges. 
- len_pred = len(pred_plan) - len_gt = len(gt_plan) - - bert_score_matrix = np.zeros((len_pred, len_gt)) - name_pred, args_pred = [], [] - name_gt, args_gt = [], [] - for i in range(len_pred): - name_pred.append(pred_plan[i]['name']) - args_pred.append(str(pred_plan[i]['args'])) - for i in range(len_gt): - name_gt.append(gt_plan[i]['name']) - args_gt.append(str(gt_plan[i]['args'])) - name_pred_emb = self.sentence_model.encode(name_pred, convert_to_tensor=True) - name_gt_emb = self.sentence_model.encode(name_gt, convert_to_tensor=True) - args_pred_emb = self.sentence_model.encode(args_pred, convert_to_tensor=True) - args_gt_emb = self.sentence_model.encode(args_gt, convert_to_tensor=True) - name_cosine_scores = np.maximum(util.cos_sim(name_pred_emb, name_gt_emb).cpu().numpy(), 0) - args_cosine_scores = np.maximum(util.cos_sim(args_pred_emb, args_gt_emb).cpu().numpy(), 0) - for i in range(len_pred): - for j in range(len_gt): - bert_score_matrix[i][j] = \ - name_cosine_scores[i][j] * self.name_weight \ - + args_cosine_scores[i][j] * self.args_weight - G = nx.Graph() - for i in range(len_pred): - for j in range(len_gt): - if bert_score_matrix[i][j] > self.match_threshold: - G.add_edge(i, str(j), weight=bert_score_matrix[i][j]) - max_weight_matching = nx.max_weight_matching(G) - - pred_to_gt_mapping = dict() - for key in max_weight_matching: - if type(key[0]) == int: - pred_to_gt_mapping[int(key[0])] = int(key[1]) - else: - pred_to_gt_mapping[int(key[1])] = int(key[0]) - - #If a prediction node does not match any golden answer node, we mark the node as -1. - for i in range(len_pred): - if i not in pred_to_gt_mapping: - pred_to_gt_mapping[i] = -1 - #Calculate how many nodes are matched by Longest Increasing Subsequence (LIS) - dp = np.ones(len_pred) - for i in range(len_pred): - for j in range(i): - if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1: - continue - if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]: - dp[i] = max(dp[i], dp[j] + 1) - correct_count = int(max(dp)) - - recall, precision = correct_count / len(gt_plan), correct_count / len(pred_plan) - f1_score = 2 * recall * precision / (recall + precision) - result = { - 'precision': precision, - 'recall': recall, - 'f1_score': f1_score - } - return result - - def _post_process(self, results_list): - # list of dict to dict of list - results = dict() - planning_metric_keys = ["precision", "recall", "f1_score", 'parse_rate'] - for key in planning_metric_keys: - results[key] = mean([result[key] for result in results_list]) - return results +from numpy import mean +from mmengine import load +from ..utils.format_load import format_load +import itertools +import networkx as nx +import numpy as np +import copy +import re +from tqdm import tqdm + +from ..schema import ResponseDataSample +from sentence_transformers import SentenceTransformer, util + + +class PlanningEvaluator: + """Planning Evaluation + Args: + dataset_path(str): File path of evaluation dataset + name_weight(float): the weight of action_name in bert_score match, default = 0.9 + args_weight(float): the weight of action_args in bert_score match, default = 0.1 + match_threshold(float): the threshold of matching + match_strategy(str): matching method, can choose 'bertscore' or 'permutation' + bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". + Refer to https://www.sbert.net/docs/pretrained_models.html for more models. 
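
A hypothetical usage sketch for the constructor documented in this docstring. The dataset path below is a placeholder, and the import assumes the package is used as `opencompass.datasets.teval.evaluators`, which the `__init__.py` shown earlier exports it from.

```python
# Hypothetical usage; the JSON path below is a placeholder, not a real file.
from opencompass.datasets.teval.evaluators import PlanningEvaluator

evaluator = PlanningEvaluator(
    dataset_path='outputs/teval/plan_json_predictions.json',  # placeholder
    name_weight=0.75,            # weight of action-name similarity
    args_weight=0.25,            # weight of action-args similarity
    match_threshold=0.7,         # minimum similarity for two actions to match
    match_strategy='bertscore',  # or 'permutation'
    bert_score_model='all-mpnet-base-v2',
)
results = evaluator.evaluate()   # dict with precision / recall / f1_score / parse_rate
```

Note that the sentence-transformer model is loaded in the constructor, so instantiation requires the model weights to be available locally or downloadable.
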
+ """ + def __init__( + self, + dataset_path: str, + name_weight = 0.75, + args_weight = 0.25, + match_threshold = 0.7, + match_strategy: str = 'bertscore', # ["bertscore", "permutation"] + bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2'] + default_prompt_type: str = 'json', # ["json", "ReWOO"] + **kwargs, + ) -> None: + self.bert_score_model = bert_score_model + print(bert_score_model) + self.dataset_path = dataset_path + self.name_weight = name_weight + self.args_weight = args_weight + self.match_threshold = match_threshold + self.default_prompt_type = default_prompt_type # ["json", "ReWOO"] + assert match_strategy in ["bertscore", "permutation"], f"match strategy must in [\"bertscore\", \"permutation\"], but get {match_strategy}" + self.match_strategy = match_strategy + self.valid_data_count = None + self.sentence_model = SentenceTransformer(self.bert_score_model) + + def _load_dataset(self): + self.dataset = [] + dataset = load(self.dataset_path) + total_error = 0 + total_count = 0 + for key in dataset.keys(): + datum = dataset[key] + data_sample, error = self._process_response(datum) + total_error += error + total_count += 1 + self.dataset.append( + dict(response_data_sample=data_sample)) + + self.num_samples = len(self.dataset) + print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) + self.valid_data_count = total_count - total_error + + def format_load(self, data): + r''' + ensure evaluator can work correctly under any data input + ''' + try: + json_format = format_load(data, start_character='[', end_character=']') + except Exception as e: + return [] + if type(json_format) != list: + return [] + for i in range(len(json_format)): + try: + json_format[i] = { + 'name': str(json_format[i]['name']), + 'id': int(json_format[i]['id']), + 'args': str(json_format[i]['args']) + } + except Exception as e: + return [] + return json_format + + def _process_response( + self, + datum, + ) -> ResponseDataSample: + """Process the response to needed format. + Args: + datum(dict): inputs. + Returns: + dict: Processed response data sample. + """ + + # Generated response, which can be a string or list + pred_data = datum['prediction'] + # Response of ground truth, which can be a string or list + gt_data = datum['ground_truth'] + # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" + if "meta" in datum: + prompt_type = datum["meta"].get("prompt_type", self.default_prompt_type) + else: + prompt_type = self.default_prompt_type + + error = 0 + pred = dict() + gt = dict() + gt['planning'] = self.format_load(gt_data) + if prompt_type == 'json': + pred['planning'] = self.format_load(pred_data) + if pred['planning'] == [] or gt['planning'] == []: + error = 1 + + elif prompt_type == 'ReWOO': + """ + This type is deprecated + The planning prediction data should in this format: + Plan 1: description about the first action + Dependency 1: the first action depends on which previous actions + Action 1: #E1 = api_name1(args1) + ... + Which will be passed only if "number of plan lines == number of dependency lines == number of action lines" + The passed data's format is: + [ + dict( + id = i, + name = curr_name, + args = args_str + ) + ... + ] + + The golden answer prediction is a json that is the same as the json format. 
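
To make the (deprecated) ReWOO layout described above concrete, here is a small invented plan together with the same regular expressions the parser applies to it; the tool names and arguments are purely illustrative.

```python
import re

# Invented two-step plan in the "Plan / Dependency / Action" layout described above.
pred_data = (
    'Plan 1: look up the weather for Beijing\n'
    'Dependency 1: none\n'
    'Action 1: #E1 = WeatherAPI.query(city="Beijing")\n'
    'Plan 2: report the result to the user\n'
    'Dependency 2: 1\n'
    'Action 2: #E2 = FinishAction(answer="#E1")\n'
)

thoughts = re.findall(r'(Plan [0-9]+: .+)', pred_data)
dependencies = re.findall(r'(Dependency [0-9]+: .+)', pred_data)
action_units = re.findall(r'Action [0-9]+: (.+)', pred_data)
assert len(thoughts) == len(dependencies) == len(action_units) == 2

# Name and args of the first action are pulled out of "#E1 = name(args)".
print(re.findall(r'#E[0-9]+ = (.+)\(', action_units[0]))      # ['WeatherAPI.query']
print(re.findall(r'#E[0-9]+ = .+\((.+)\)', action_units[0]))  # ['city="Beijing"']
```
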
+ """ + thoughts = re.findall(r'(Plan [0-9]+: .+)', pred_data) + dependencies = re.findall(r'(Dependency [0-9]+: .+)', pred_data) + action_units = re.findall(r'Action [0-9]+: (.+)', pred_data) + + if not (len(thoughts) == len(dependencies) and len(thoughts) == len(action_units)): + pred['planning'] = [] + gt['planning'] = [] + return ResponseDataSample(template = '', pred=pred, gt=gt), 1 + + plan_action = [] + for i in range(len(action_units)): + dependency_list = re.findall(r'Dependency [0-9]+: (.+)', dependencies[i]) + if action_units[i][0] == '#': + # The action has a return #E + args_str_list = re.findall(r'#E[0-9]+ = .+\((.+)\)', action_units[i]) + name_list = re.findall(r'#E[0-9]+ = (.+)\(', action_units[i]) + else: + # The action does not have a return + args_str_list = re.findall(r'.+\((.+)\)', action_units[i]) + name_list = re.findall(r'(.+)\(', action_units[i]) + if (len(name_list) > 0): + curr_name = name_list[0] + else: + curr_name = "" + if (len(args_str_list) > 0): + args_str = "{" + args_str_list[0] + "}" + else: + args_str = "{}" + if (len(dependency_list) > 0): + dependency_str = dependency_list[0] + else: + dependency_str = "" + dependency = re.findall('([0-9]+)', dependency_str) + dependency = list(set([int(x) - 1 for x in dependency])) + plan_action.append( + dict( + id = i, + name = curr_name, + prev = dependency, + args = args_str + )) + pred['planning'] = plan_action + #Turn dict into args str + for i in range(len(gt['planning'])): + args_str = "" + if type(gt['planning'][i]['args']) == str: + args_dict = eval(gt['planning'][i]['args']) + else: + assert type(gt['planning'][i]['args']) == dict + args_dict = gt['planning'][i]['args'] + for it in args_dict: + if args_str == "": args_str += f"{it}=\"{args_dict[it]}\"" + else: args_str += f", {it}=\"{args_dict[it]}\"" + gt['planning'][i]['args'] = '{' + args_str + '}' + + elif prompt_type == 'str': + pred_data_format = pred_data.replace('. 
', '\n').split('\n') + pred_actions = [] + for pred_step in pred_data_format: + first_occur_time = 1e9 + pred_action = "" + for api_name in datum['meta']['API_list']: + occur_time = pred_step.find(api_name) + if occur_time != -1 and occur_time < first_occur_time: + first_occur_time = occur_time + pred_action = api_name + if pred_action != "": + pred_actions.append({ + 'id': len(pred_actions), + 'name': pred_action, + 'args': pred_step + }) + pred['planning'] = pred_actions + if len(pred['planning']) == 0: + error = 1 + else: + raise NotImplementedError(f"Currently, we only support json and ReWOO format, but get {prompt_type}") + + return ResponseDataSample(template = '', pred=pred, gt=gt), error + + def _evaluate(self, data_sample) -> dict: + if self.match_strategy == 'bertscore': + metrics_result = self.bertscore_match( + data_sample.pred['planning'], data_sample.gt['planning']) + elif self.match_strategy == 'permutation': + metrics_result = self.permutation_match( + data_sample.pred['planning'], data_sample.gt['planning']) + else: + raise NotImplementedError + if len(data_sample.pred['planning']) == 0 or len(data_sample.gt['planning']) == 0: + metrics_result['parse_rate'] = 0 + else: + metrics_result['parse_rate'] = 1 + return metrics_result + + def evaluate(self): + self._load_dataset() + results_list = [] + for data_sample in tqdm(self.dataset): + metrics_result = self._evaluate( + data_sample['response_data_sample']) + results_list.append(metrics_result) + return self._post_process(results_list) + + def permutation_match(self, pred_plan, gt_plan) -> dict: + ''' + The function calculates all the permutation matches' score and selects the max f1_score; + Since permutation is time consuming, we truncate the length of plans to 9 + ''' + if pred_plan[-1]['name'] != 'FinishAction': + pred_plan.append( + {'id': len(pred_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'} + ) + + if gt_plan[-1]['name'] != 'FinishAction': + gt_plan.append( + {'id': len(gt_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'} + ) + + # truncate plans to 9 since it is too long for permutation. 
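
A brief illustration of why the truncation in the next statement is needed: the matcher enumerates permutations of step indices, and the count P(n, n) = n! grows factorially, so capping the plan length at 9 keeps the search to a few hundred thousand candidates. The numbers below are exact.

```python
from math import perm  # Python 3.8+

# Number of assignments enumerated when both plans have n steps: P(n, n) = n!
for n in (5, 9, 12):
    print(n, perm(n, n))
# 5 120
# 9 362880
# 12 479001600
```
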
+ if len(pred_plan) > 9: pred_plan = pred_plan[:9] + if len(gt_plan) > 9: gt_plan = pred_plan[:9] + + pred_plan = sorted(pred_plan, key=lambda x: x['id']) + gt_plan = sorted(gt_plan, key=lambda x: x['id']) + len_pred = len(pred_plan) + len_gt = len(gt_plan) + map_id_max = max(len_pred, len_gt) + numbers = [i for i in range(map_id_max)] + perms = itertools.permutations(numbers, len_pred) + gt_prev_count, pred_prev_count = 0, 0 + for i in range(len_gt): + gt_plan[i]['prev'].append(i) + gt_prev_count += len(gt_plan[i]['prev']) + for i in range(len_pred): + pred_plan[i]['prev'].append(i) + pred_prev_count += len(pred_plan[i]['prev']) + if gt_prev_count == 0 or pred_prev_count == 0: + return { + 'precision': 0, + 'recall': 0, + 'f1_score': 0 + } + max_recall, max_precision, max_f1 = 0, 0, 0 + for perm in perms: + correct_count = 0 + for i in range(len_pred): + if perm[i] >= len_gt: + continue + for j in pred_plan[i]['prev']: + if perm[j] in gt_plan[perm[i]]['prev']: + correct_count += 1 + now_recall, now_precision = correct_count / gt_prev_count, correct_count / pred_prev_count + if now_recall + now_precision == 0: + continue + now_f1 = 2 * now_recall * now_precision / (now_recall + now_precision) + if now_f1 > max_f1: + max_f1, max_recall, max_precision = now_f1, now_recall, now_precision + return { + 'precision': max_precision, + 'recall': max_recall, + 'f1_score': max_f1 + } + + def bertscore_match(self, pred_plan, gt_plan) -> dict: + """ + Calculate the similarity between predicted plan and golden answer, + A plan can be regarded a sequence of actions, and each action has a name and args. + Firstly, use bertscore to calculate pointwise similarity by: + similarity(u, v) = bertscore(u.name, v.name) * name_weight + bertscore(u.args, v.args) * args_weight; + Secondly, use Hungarian matching to match the points; + Finally, use LIS to calculate the number of matched nodes. + """ + if len(pred_plan) == 0 or len(gt_plan) == 0: + return { + 'precision': 0, + 'recall': 0, + 'f1_score': 0 + } + + pred_plan = copy.deepcopy(sorted(pred_plan, key=lambda x: x['id'])) + gt_plan = copy.deepcopy(sorted(gt_plan, key=lambda x: x['id'])) + + #Add end action + #Currently it is hard-code + if pred_plan[-1]['name'] == 'FinishAction': + pred_plan = pred_plan[:-1] + if gt_plan[-1]['name'] == 'FinishAction': + gt_plan = gt_plan[:-1] + #The total counts of nodes and edges. 
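
The final step of `bertscore_match` below uses a longest-increasing-subsequence pass over the prediction-to-gold index mapping to count how many matched actions also appear in the correct order. A standalone toy run with an invented mapping:

```python
import numpy as np

# Invented mapping: prediction step i -> matched gold step, or -1 if unmatched.
pred_to_gt_mapping = {0: 1, 1: 0, 2: 2, 3: -1, 4: 3}

len_pred = len(pred_to_gt_mapping)
dp = np.ones(len_pred)
for i in range(len_pred):
    for j in range(i):
        if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1:
            continue
        if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]:
            dp[i] = max(dp[i], dp[j] + 1)

print(int(max(dp)))  # 3: e.g. gold steps 0, 2, 3 are hit in order by pred steps 1, 2, 4
```

Dividing that count by the gold and predicted plan lengths gives the recall and precision reported by the evaluator.
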
+ len_pred = len(pred_plan) + len_gt = len(gt_plan) + + bert_score_matrix = np.zeros((len_pred, len_gt)) + name_pred, args_pred = [], [] + name_gt, args_gt = [], [] + for i in range(len_pred): + name_pred.append(pred_plan[i]['name']) + args_pred.append(str(pred_plan[i]['args'])) + for i in range(len_gt): + name_gt.append(gt_plan[i]['name']) + args_gt.append(str(gt_plan[i]['args'])) + name_pred_emb = self.sentence_model.encode(name_pred, convert_to_tensor=True) + name_gt_emb = self.sentence_model.encode(name_gt, convert_to_tensor=True) + args_pred_emb = self.sentence_model.encode(args_pred, convert_to_tensor=True) + args_gt_emb = self.sentence_model.encode(args_gt, convert_to_tensor=True) + name_cosine_scores = np.maximum(util.cos_sim(name_pred_emb, name_gt_emb).cpu().numpy(), 0) + args_cosine_scores = np.maximum(util.cos_sim(args_pred_emb, args_gt_emb).cpu().numpy(), 0) + for i in range(len_pred): + for j in range(len_gt): + bert_score_matrix[i][j] = \ + name_cosine_scores[i][j] * self.name_weight \ + + args_cosine_scores[i][j] * self.args_weight + G = nx.Graph() + for i in range(len_pred): + for j in range(len_gt): + if bert_score_matrix[i][j] > self.match_threshold: + G.add_edge(i, str(j), weight=bert_score_matrix[i][j]) + max_weight_matching = nx.max_weight_matching(G) + + pred_to_gt_mapping = dict() + for key in max_weight_matching: + if type(key[0]) == int: + pred_to_gt_mapping[int(key[0])] = int(key[1]) + else: + pred_to_gt_mapping[int(key[1])] = int(key[0]) + + #If a prediction node does not match any golden answer node, we mark the node as -1. + for i in range(len_pred): + if i not in pred_to_gt_mapping: + pred_to_gt_mapping[i] = -1 + #Calculate how many nodes are matched by Longest Increasing Subsequence (LIS) + dp = np.ones(len_pred) + for i in range(len_pred): + for j in range(i): + if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1: + continue + if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]: + dp[i] = max(dp[i], dp[j] + 1) + correct_count = int(max(dp)) + + recall, precision = correct_count / len(gt_plan), correct_count / len(pred_plan) + f1_score = 2 * recall * precision / (recall + precision) + result = { + 'precision': precision, + 'recall': recall, + 'f1_score': f1_score + } + return result + + def _post_process(self, results_list): + # list of dict to dict of list + results = dict() + planning_metric_keys = ["precision", "recall", "f1_score", 'parse_rate'] + for key in planning_metric_keys: + results[key] = mean([result[key] for result in results_list]) + return results diff --git a/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py b/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py index c137407f..bf866641 100644 --- a/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +++ b/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py @@ -1,455 +1,455 @@ -import json -from numpy import mean -from mmengine import load -import numpy as np -import json -import re -from tqdm import tqdm - -from ..schema import ResponseDataSample -from ..utils.format_load import format_load -from sentence_transformers import SentenceTransformer, util - - -def input_postprocess(text: str) -> str: - if isinstance(text, str): - text = text.split('<|')[0] - text = text.split('\n')[0] - text = text.split('\n')[0] - text = text.split('<|im_end|>')[0] - if len(text) > 1 and text[:2] == '{{' and text[-2:] == '}}': - text = text[1:-1] - while len(text) > 0 and text[-1] == '\n': - text = 
text[:-1] - return str(text) - -class ReasonRetrieveUnderstandEvaluator: - """Planning Evaluation - Args: - dataset_path(str): File path of evaluation dataset - bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". - Refer to https://www.sbert.net/docs/pretrained_models.html for more models. - """ - def __init__( - self, - dataset_path: str, - bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2'] - default_prompt_type: str = 'json', - eval_type: str = 'reason', - **kwargs, - ) -> None: - self.bert_score_model = bert_score_model - print(bert_score_model) - self.dataset_path = dataset_path - # self.bertscore = evaluate.load('bertscore') - self.default_prompt_type = default_prompt_type # ["json", "str"] - self.eval_type = eval_type - self.valid_data_count = None - self.sentence_model = SentenceTransformer(self.bert_score_model) - - def _load_dataset(self): - self.dataset = [] - dataset = load(self.dataset_path) - total_error = 0 - total_count = 0 - for key in dataset.keys(): - datum = dataset[key] - data_sample, error = self._process_response(datum) - total_error += error - total_count += 1 - self.dataset.append( - dict(response_data_sample=data_sample)) - - self.num_samples = len(self.dataset) - # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) - self.valid_data_count = total_count - total_error - - def format_load(self, data): - r''' - ensure evaluator can work correctly under any data input - ''' - try: - json_format = format_load(data, start_character='{', end_character='}') - except Exception as e: - return {} - if type(json_format) != dict: - return {} - prepared_json_format = dict() - try: - prepared_json_format['thought'] = str(json_format['thought']) - except Exception as e: - prepared_json_format['thought'] = '' - try: - prepared_json_format['name'] = str(json_format['name']) - except Exception as e: - prepared_json_format['name'] = '' - - if self.default_prompt_type == 'json': - try: - if isinstance(json_format['args'], dict): - prepared_json_format['args'] = json_format['args'] - else: - prepared_json_format['args'] = dict() - except: - prepared_json_format['args'] = dict() - else: - try: - prepared_json_format['args'] = str(json_format['args']) - except Exception as e: - prepared_json_format['args'] = "" - - return prepared_json_format - - def _process_response( - self, - datum, - ) -> ResponseDataSample: - """Process the response to needed format. - Args: - datum(dict): inputs. - Returns: - dict: Processed response data sample. 
- """ - - # Generated response, which can be a string or list - pred_data = datum['prediction'] - # Response of ground truth, which can be a string or list - gt_data = datum['ground_truth'] - # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" - if "meta_data" in datum: - prompt_type = datum["meta_data"].get("response_format", self.default_prompt_type) - else: - prompt_type = self.default_prompt_type - - error = 0 - gt = self.format_load(gt_data) - # pred_data = input_postprocess(pred_data) - - if prompt_type == 'json': - pred = self.format_load(pred_data) - if pred == {} or gt == {}: - error = 1 - elif prompt_type == 'str': - # choose the first line - pred = dict() - if self.eval_type == 'reason': - pred['thought'] = pred_data - if self.eval_type == 'retrieve': - pred['name'] = pred_data - if self.eval_type == 'understand': - pred['args'] = pred_data - else: - raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}") - - if error == 1: - pred = dict() - return ResponseDataSample(template = '', pred=pred, gt=gt), error - - def _evaluate(self, data_sample): - """Evaluate the response data sample. - """ - # To enable batch evaluation, the evaluator is written at post_process. - return data_sample - - def evaluate(self): - self._load_dataset() - results_list = [] - for data_sample in tqdm(self.dataset): - metrics_result = self._evaluate( - data_sample['response_data_sample']) - results_list.append(metrics_result) - return self._post_process(results_list) - - def find_a_dot_b_structure(self, text): - # find a.b structure - pattern = r'\w+\.\w+' - return re.findall(pattern, text) - - def find_FinishAction(self, text): - # find FinishAction - pattern = r'FinishAction' - return re.findall(pattern, text) - - def _post_process(self, results_list): - # list of dict to dict of list - if self.default_prompt_type == 'json': - metric_keys = ['thought', 'name', 'args', 'parse_rate'] - if self.default_prompt_type == 'str': - if self.eval_type == 'reason': - metric_keys = ['thought', 'parse_rate'] - if self.eval_type == 'retrieve': - metric_keys = ['name', 'parse_rate'] - if self.eval_type == 'understand': - metric_keys = ['args', 'parse_rate'] - metrics_results = [] - batch_data = []; batch_arg_data = [] - batch_id = []; batch_arg_id = [] - BATCH_LIMIT = 32 - for id, data in enumerate(results_list): - metrics_results.append( - {metric_keys[x]: 0 for x in range(len(metric_keys))} - ) - if len(data.pred.keys()) != 0: - metrics_results[id]['parse_rate'] = 1 - if 'thought' in data.pred and 'thought' in data.gt: - batch_data.extend([data.pred['thought'], data.gt['thought']]) - batch_id.extend([id]) - if len(batch_data) >= BATCH_LIMIT: - pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True) - for i in range(0, len(batch_data), 2): - cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) - metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0] - batch_data = [] - batch_id = [] - if 'name' in data.pred and 'name' in data.gt: - if self.default_prompt_type == 'json': - if data.pred['name'] == data.gt['name']: - metrics_results[id]['name'] = 1 - else: - metrics_results[id]['name'] = 0 - else: - if data.gt['name'] not in data.pred['name']: - metrics_results[id]['name'] = 0 - else: - metrics_results[id]['name'] = 1 - find_all_name = self.find_a_dot_b_structure(data.pred['name']) + self.find_FinishAction(data.pred['name']) - for name in find_all_name: - if name != data.gt['name']: - 
metrics_results[id]['name'] = 0 - - if 'args' in data.pred and 'args' in data.gt: - batch_arg_data.extend([str(data.pred['args']), str(data.gt['args'])]) - batch_arg_id.extend([id]) - if len(batch_arg_data) >= BATCH_LIMIT: - pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True) - for i in range(0, len(batch_arg_data), 2): - cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) - metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0] - batch_arg_data = [] - batch_arg_id = [] - - if len(batch_data) > 0: - pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True) - for i in range(0, len(batch_data), 2): - cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) - metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0] - batch_data = [] - batch_id = [] - - if len(batch_arg_data) > 0: - pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True) - for i in range(0, len(batch_arg_data), 2): - cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) - metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0] - batch_arg_data = [] - batch_arg_id = [] - - results = dict() - for key in metric_keys: - results[key] = mean([metrics_results[key] for metrics_results in metrics_results]) - return results - -class ReasonRetrieveUnderstandEvaluatorNoBatch: - """Planning Evaluation - Args: - dataset_path(str): File path of evaluation dataset - bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". - Refer to https://www.sbert.net/docs/pretrained_models.html for more models. - """ - def __init__( - self, - dataset_path: str, - bert_score_model: str = "all-mpnet-base-v2", - default_prompt_type: str = 'json', - eval_type: str = 'reason', - ) -> None: - self.bert_score_model = bert_score_model - self.dataset_path = dataset_path - # self.bertscore = evaluate.load('bertscore') - self.default_prompt_type = default_prompt_type # ["json", "str"] - self.eval_type = eval_type - self.valid_data_count = None - self.sentence_model = SentenceTransformer(self.bert_score_model) - - def _load_dataset(self): - self.dataset = [] - dataset = load(self.dataset_path) - total_error = 0 - total_count = 0 - for key in dataset.keys(): - datum = dataset[key] - data_sample, error = self._process_response(datum) - total_error += error - total_count += 1 - self.dataset.append( - dict(response_data_sample=data_sample)) - - self.num_samples = len(self.dataset) - # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) - self.valid_data_count = total_count - total_error - - def format_load(self, data): - r''' - ensure evaluator can work correctly under any data input - ''' - if type(data) == dict: - json_format = data - else: - try: - json_format = json.loads(data) #json.loads(pred_data) - except Exception as e: - return {} - if type(json_format) != dict: - return {} - prepared_json_format = dict() - try: - prepared_json_format['thought'] = str(json_format['thought']) - except Exception as e: - prepared_json_format['thought'] = '' - try: - prepared_json_format['name'] = str(json_format['name']) - except Exception as e: - prepared_json_format['name'] = '' - try: - if prepared_json_format["name"] != "FinishAction": - arg_inputs = json_format["args"] - if type(arg_inputs) == str: - arg_inputs = json.loads(arg_inputs) - if type(arg_inputs) == dict: - prepared_json_format['args'] = arg_inputs - 
else: - prepared_json_format["args"] = {} - else: - prepared_json_format["args"] = {} - except Exception as e: - prepared_json_format['args'] = {} - return prepared_json_format - - def _process_response( - self, - datum, - ) -> ResponseDataSample: - """Process the response to needed format. - Args: - datum(dict): inputs. - Returns: - dict: Processed response data sample. - """ - - # Generated response, which can be a string or list - pred_data = datum['prediction'] - # Response of ground truth, which can be a string or list - gt_data = datum['ground_truth'] - # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" - if "meta" in datum: - prompt_type = datum["meta"].get("response_format", self.default_prompt_type) - else: - prompt_type = self.default_prompt_type - - error = 0 - gt = self.format_load(gt_data) - # pred_data = input_postprocess(pred_data) - if prompt_type == 'json': - # pred_data = pred_data.replace('\'', '\"') - pred = self.format_load(pred_data) - if pred == {} or gt == {}: - error = 1 - elif prompt_type == 'str': - # choose the first line - pred = dict() - if self.eval_type == 'reason': - pred['thought'] = pred_data - if self.eval_type == 'retrieve': - pred['name'] = pred_data - if self.eval_type == 'understand': - # pred_data = pred_data.replace('\'', '\"') - # try: - # pred['args'] = json.loads(pred_data) - # if type(pred['args']) != dict: - # pred['args'] = {} - # except Exception as e: - # error = 1 - pred['args'] = pred_data - else: - raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}") - - if error == 1: - pred = dict() - return ResponseDataSample(template = '', pred=pred, gt=gt), error - - def _evaluate(self, data_sample) -> dict: - """Evaluate the response data sample. 
- """ - metrics_result = { - 'thought': 0, - 'name': 0, - 'args_precision': 0, - 'args_recall': 0, - 'args_f1_score': 0, - 'parse_rate': 0, - } - if 'thought' in data_sample.pred and 'thought' in data_sample.gt: - pred_emb = self.sentence_model.encode(data_sample.pred['thought'], convert_to_tensor=True) - gt_emb = self.sentence_model.encode(data_sample.gt['thought'], convert_to_tensor=True) - cosine_scores = np.maximum(util.cos_sim(pred_emb, gt_emb).cpu().numpy(), 0) - metrics_result['thought'] = cosine_scores[0, 0] - - if 'name' in data_sample.pred and 'name' in data_sample.gt: - if data_sample.pred['name'] == data_sample.gt['name']: - metrics_result['name'] = 1 - else: - metrics_result['name'] = 0 - if 'args' in data_sample.pred and 'args' in data_sample.gt: - gt_num_keys = len(data_sample.gt['args'].keys()) - pred_num_keys = len(data_sample.pred['args'].keys()) - if pred_num_keys == 0 and gt_num_keys == 0: - metrics_result['args_precision'] = 1 - metrics_result['args_recall'] = 1 - metrics_result['args_f1_score'] = 1 - elif pred_num_keys == 0 or gt_num_keys == 0: - metrics_result['args_precision'] = 0 - metrics_result['args_recall'] = 0 - metrics_result['args_f1_score'] = 0 - else: - correct_count = 0 - for key in data_sample.gt['args'].keys(): - if key in data_sample.pred['args'] and str(data_sample.pred['args'][key]) == str(data_sample.gt['args'][key]): - correct_count += 1 - metrics_result['args_precision'] = correct_count / pred_num_keys - metrics_result['args_recall'] = correct_count / gt_num_keys - if metrics_result['args_precision'] + metrics_result['args_recall'] == 0: - metrics_result['args_f1_score'] = 0 - else: - metrics_result['args_f1_score'] = 2 * metrics_result['args_precision'] * metrics_result['args_recall'] / \ - (metrics_result['args_precision'] + metrics_result['args_recall']) - - if len(data_sample.pred.keys()) == 0: - metrics_result['parse_rate'] = 0 - else: - metrics_result['parse_rate'] = 1 - return metrics_result - - def evaluate(self): - self._load_dataset() - results_list = [] - for data_sample in tqdm(self.dataset): - metrics_result = self._evaluate( - data_sample['response_data_sample']) - results_list.append(metrics_result) - return self._post_process(results_list) - - def _post_process(self, results_list): - # list of dict to dict of list - results = dict() - if self.default_prompt_type == 'json': - metric_keys = ['thought', 'name', 'args_precision', 'args_recall', 'args_f1_score', 'parse_rate'] - if self.default_prompt_type == 'str': - if self.eval_type == 'reason': - metric_keys = ['thought', 'parse_rate'] - if self.eval_type == 'retrieve': - metric_keys = ['name', 'parse_rate'] - if self.eval_type == 'understand': - metric_keys = ['args_precision', 'args_recall', 'args_f1_score', 'parse_rate'] - for key in metric_keys: - results[key] = mean([result[key] for result in results_list]) - return results +import json +from numpy import mean +from mmengine import load +import numpy as np +import json +import re +from tqdm import tqdm + +from ..schema import ResponseDataSample +from ..utils.format_load import format_load +from sentence_transformers import SentenceTransformer, util + + +def input_postprocess(text: str) -> str: + if isinstance(text, str): + text = text.split('<|')[0] + text = text.split('\n')[0] + text = text.split('\n')[0] + text = text.split('<|im_end|>')[0] + if len(text) > 1 and text[:2] == '{{' and text[-2:] == '}}': + text = text[1:-1] + while len(text) > 0 and text[-1] == '\n': + text = text[:-1] + return str(text) + +class 
ReasonRetrieveUnderstandEvaluator: + """Planning Evaluation + Args: + dataset_path(str): File path of evaluation dataset + bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". + Refer to https://www.sbert.net/docs/pretrained_models.html for more models. + """ + def __init__( + self, + dataset_path: str, + bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2'] + default_prompt_type: str = 'json', + eval_type: str = 'reason', + **kwargs, + ) -> None: + self.bert_score_model = bert_score_model + print(bert_score_model) + self.dataset_path = dataset_path + # self.bertscore = evaluate.load('bertscore') + self.default_prompt_type = default_prompt_type # ["json", "str"] + self.eval_type = eval_type + self.valid_data_count = None + self.sentence_model = SentenceTransformer(self.bert_score_model) + + def _load_dataset(self): + self.dataset = [] + dataset = load(self.dataset_path) + total_error = 0 + total_count = 0 + for key in dataset.keys(): + datum = dataset[key] + data_sample, error = self._process_response(datum) + total_error += error + total_count += 1 + self.dataset.append( + dict(response_data_sample=data_sample)) + + self.num_samples = len(self.dataset) + # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) + self.valid_data_count = total_count - total_error + + def format_load(self, data): + r''' + ensure evaluator can work correctly under any data input + ''' + try: + json_format = format_load(data, start_character='{', end_character='}') + except Exception as e: + return {} + if type(json_format) != dict: + return {} + prepared_json_format = dict() + try: + prepared_json_format['thought'] = str(json_format['thought']) + except Exception as e: + prepared_json_format['thought'] = '' + try: + prepared_json_format['name'] = str(json_format['name']) + except Exception as e: + prepared_json_format['name'] = '' + + if self.default_prompt_type == 'json': + try: + if isinstance(json_format['args'], dict): + prepared_json_format['args'] = json_format['args'] + else: + prepared_json_format['args'] = dict() + except: + prepared_json_format['args'] = dict() + else: + try: + prepared_json_format['args'] = str(json_format['args']) + except Exception as e: + prepared_json_format['args'] = "" + + return prepared_json_format + + def _process_response( + self, + datum, + ) -> ResponseDataSample: + """Process the response to needed format. + Args: + datum(dict): inputs. + Returns: + dict: Processed response data sample. 
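
For orientation, an invented record in the 'json' response format and the comparison `_process_response` sets up; the tool name and arguments are placeholders.

```python
import json

# Invented example of one record in the 'json' response format.
datum = {
    'prediction': '{"thought": "I should check the weather first.", '
                  '"name": "WeatherAPI.query", "args": {"city": "Beijing"}}',
    'ground_truth': '{"thought": "Query the weather.", '
                    '"name": "WeatherAPI.query", "args": {"city": "Beijing"}}',
    'meta_data': {'response_format': 'json'},
}

pred, gt = json.loads(datum['prediction']), json.loads(datum['ground_truth'])
print(pred['name'] == gt['name'])  # True: the 'name' metric is 1 for this sample
print(pred['args'] == gt['args'])  # True: identical args score highly under the
                                   # embedding-based args similarity used below
# 'thought' is compared with sentence-embedding cosine similarity, batched in
# _post_process further down.
```
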
+ """ + + # Generated response, which can be a string or list + pred_data = datum['prediction'] + # Response of ground truth, which can be a string or list + gt_data = datum['ground_truth'] + # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" + if "meta_data" in datum: + prompt_type = datum["meta_data"].get("response_format", self.default_prompt_type) + else: + prompt_type = self.default_prompt_type + + error = 0 + gt = self.format_load(gt_data) + # pred_data = input_postprocess(pred_data) + + if prompt_type == 'json': + pred = self.format_load(pred_data) + if pred == {} or gt == {}: + error = 1 + elif prompt_type == 'str': + # choose the first line + pred = dict() + if self.eval_type == 'reason': + pred['thought'] = pred_data + if self.eval_type == 'retrieve': + pred['name'] = pred_data + if self.eval_type == 'understand': + pred['args'] = pred_data + else: + raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}") + + if error == 1: + pred = dict() + return ResponseDataSample(template = '', pred=pred, gt=gt), error + + def _evaluate(self, data_sample): + """Evaluate the response data sample. + """ + # To enable batch evaluation, the evaluator is written at post_process. + return data_sample + + def evaluate(self): + self._load_dataset() + results_list = [] + for data_sample in tqdm(self.dataset): + metrics_result = self._evaluate( + data_sample['response_data_sample']) + results_list.append(metrics_result) + return self._post_process(results_list) + + def find_a_dot_b_structure(self, text): + # find a.b structure + pattern = r'\w+\.\w+' + return re.findall(pattern, text) + + def find_FinishAction(self, text): + # find FinishAction + pattern = r'FinishAction' + return re.findall(pattern, text) + + def _post_process(self, results_list): + # list of dict to dict of list + if self.default_prompt_type == 'json': + metric_keys = ['thought', 'name', 'args', 'parse_rate'] + if self.default_prompt_type == 'str': + if self.eval_type == 'reason': + metric_keys = ['thought', 'parse_rate'] + if self.eval_type == 'retrieve': + metric_keys = ['name', 'parse_rate'] + if self.eval_type == 'understand': + metric_keys = ['args', 'parse_rate'] + metrics_results = [] + batch_data = []; batch_arg_data = [] + batch_id = []; batch_arg_id = [] + BATCH_LIMIT = 32 + for id, data in enumerate(results_list): + metrics_results.append( + {metric_keys[x]: 0 for x in range(len(metric_keys))} + ) + if len(data.pred.keys()) != 0: + metrics_results[id]['parse_rate'] = 1 + if 'thought' in data.pred and 'thought' in data.gt: + batch_data.extend([data.pred['thought'], data.gt['thought']]) + batch_id.extend([id]) + if len(batch_data) >= BATCH_LIMIT: + pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True) + for i in range(0, len(batch_data), 2): + cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) + metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0] + batch_data = [] + batch_id = [] + if 'name' in data.pred and 'name' in data.gt: + if self.default_prompt_type == 'json': + if data.pred['name'] == data.gt['name']: + metrics_results[id]['name'] = 1 + else: + metrics_results[id]['name'] = 0 + else: + if data.gt['name'] not in data.pred['name']: + metrics_results[id]['name'] = 0 + else: + metrics_results[id]['name'] = 1 + find_all_name = self.find_a_dot_b_structure(data.pred['name']) + self.find_FinishAction(data.pred['name']) + for name in find_all_name: + if name != data.gt['name']: + 
metrics_results[id]['name'] = 0 + + if 'args' in data.pred and 'args' in data.gt: + batch_arg_data.extend([str(data.pred['args']), str(data.gt['args'])]) + batch_arg_id.extend([id]) + if len(batch_arg_data) >= BATCH_LIMIT: + pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True) + for i in range(0, len(batch_arg_data), 2): + cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) + metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0] + batch_arg_data = [] + batch_arg_id = [] + + if len(batch_data) > 0: + pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True) + for i in range(0, len(batch_data), 2): + cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) + metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0] + batch_data = [] + batch_id = [] + + if len(batch_arg_data) > 0: + pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True) + for i in range(0, len(batch_arg_data), 2): + cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0) + metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0] + batch_arg_data = [] + batch_arg_id = [] + + results = dict() + for key in metric_keys: + results[key] = mean([metrics_results[key] for metrics_results in metrics_results]) + return results + +class ReasonRetrieveUnderstandEvaluatorNoBatch: + """Planning Evaluation + Args: + dataset_path(str): File path of evaluation dataset + bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2". + Refer to https://www.sbert.net/docs/pretrained_models.html for more models. + """ + def __init__( + self, + dataset_path: str, + bert_score_model: str = "all-mpnet-base-v2", + default_prompt_type: str = 'json', + eval_type: str = 'reason', + ) -> None: + self.bert_score_model = bert_score_model + self.dataset_path = dataset_path + # self.bertscore = evaluate.load('bertscore') + self.default_prompt_type = default_prompt_type # ["json", "str"] + self.eval_type = eval_type + self.valid_data_count = None + self.sentence_model = SentenceTransformer(self.bert_score_model) + + def _load_dataset(self): + self.dataset = [] + dataset = load(self.dataset_path) + total_error = 0 + total_count = 0 + for key in dataset.keys(): + datum = dataset[key] + data_sample, error = self._process_response(datum) + total_error += error + total_count += 1 + self.dataset.append( + dict(response_data_sample=data_sample)) + + self.num_samples = len(self.dataset) + # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error) + self.valid_data_count = total_count - total_error + + def format_load(self, data): + r''' + ensure evaluator can work correctly under any data input + ''' + if type(data) == dict: + json_format = data + else: + try: + json_format = json.loads(data) #json.loads(pred_data) + except Exception as e: + return {} + if type(json_format) != dict: + return {} + prepared_json_format = dict() + try: + prepared_json_format['thought'] = str(json_format['thought']) + except Exception as e: + prepared_json_format['thought'] = '' + try: + prepared_json_format['name'] = str(json_format['name']) + except Exception as e: + prepared_json_format['name'] = '' + try: + if prepared_json_format["name"] != "FinishAction": + arg_inputs = json_format["args"] + if type(arg_inputs) == str: + arg_inputs = json.loads(arg_inputs) + if type(arg_inputs) == dict: + prepared_json_format['args'] = arg_inputs + 
else: + prepared_json_format["args"] = {} + else: + prepared_json_format["args"] = {} + except Exception as e: + prepared_json_format['args'] = {} + return prepared_json_format + + def _process_response( + self, + datum, + ) -> ResponseDataSample: + """Process the response to needed format. + Args: + datum(dict): inputs. + Returns: + dict: Processed response data sample. + """ + + # Generated response, which can be a string or list + pred_data = datum['prediction'] + # Response of ground truth, which can be a string or list + gt_data = datum['ground_truth'] + # prompt_type: The type of planning prompt, supporting "json" and "ReWOO" + if "meta" in datum: + prompt_type = datum["meta"].get("response_format", self.default_prompt_type) + else: + prompt_type = self.default_prompt_type + + error = 0 + gt = self.format_load(gt_data) + # pred_data = input_postprocess(pred_data) + if prompt_type == 'json': + # pred_data = pred_data.replace('\'', '\"') + pred = self.format_load(pred_data) + if pred == {} or gt == {}: + error = 1 + elif prompt_type == 'str': + # choose the first line + pred = dict() + if self.eval_type == 'reason': + pred['thought'] = pred_data + if self.eval_type == 'retrieve': + pred['name'] = pred_data + if self.eval_type == 'understand': + # pred_data = pred_data.replace('\'', '\"') + # try: + # pred['args'] = json.loads(pred_data) + # if type(pred['args']) != dict: + # pred['args'] = {} + # except Exception as e: + # error = 1 + pred['args'] = pred_data + else: + raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}") + + if error == 1: + pred = dict() + return ResponseDataSample(template = '', pred=pred, gt=gt), error + + def _evaluate(self, data_sample) -> dict: + """Evaluate the response data sample. 
+ """ + metrics_result = { + 'thought': 0, + 'name': 0, + 'args_precision': 0, + 'args_recall': 0, + 'args_f1_score': 0, + 'parse_rate': 0, + } + if 'thought' in data_sample.pred and 'thought' in data_sample.gt: + pred_emb = self.sentence_model.encode(data_sample.pred['thought'], convert_to_tensor=True) + gt_emb = self.sentence_model.encode(data_sample.gt['thought'], convert_to_tensor=True) + cosine_scores = np.maximum(util.cos_sim(pred_emb, gt_emb).cpu().numpy(), 0) + metrics_result['thought'] = cosine_scores[0, 0] + + if 'name' in data_sample.pred and 'name' in data_sample.gt: + if data_sample.pred['name'] == data_sample.gt['name']: + metrics_result['name'] = 1 + else: + metrics_result['name'] = 0 + if 'args' in data_sample.pred and 'args' in data_sample.gt: + gt_num_keys = len(data_sample.gt['args'].keys()) + pred_num_keys = len(data_sample.pred['args'].keys()) + if pred_num_keys == 0 and gt_num_keys == 0: + metrics_result['args_precision'] = 1 + metrics_result['args_recall'] = 1 + metrics_result['args_f1_score'] = 1 + elif pred_num_keys == 0 or gt_num_keys == 0: + metrics_result['args_precision'] = 0 + metrics_result['args_recall'] = 0 + metrics_result['args_f1_score'] = 0 + else: + correct_count = 0 + for key in data_sample.gt['args'].keys(): + if key in data_sample.pred['args'] and str(data_sample.pred['args'][key]) == str(data_sample.gt['args'][key]): + correct_count += 1 + metrics_result['args_precision'] = correct_count / pred_num_keys + metrics_result['args_recall'] = correct_count / gt_num_keys + if metrics_result['args_precision'] + metrics_result['args_recall'] == 0: + metrics_result['args_f1_score'] = 0 + else: + metrics_result['args_f1_score'] = 2 * metrics_result['args_precision'] * metrics_result['args_recall'] / \ + (metrics_result['args_precision'] + metrics_result['args_recall']) + + if len(data_sample.pred.keys()) == 0: + metrics_result['parse_rate'] = 0 + else: + metrics_result['parse_rate'] = 1 + return metrics_result + + def evaluate(self): + self._load_dataset() + results_list = [] + for data_sample in tqdm(self.dataset): + metrics_result = self._evaluate( + data_sample['response_data_sample']) + results_list.append(metrics_result) + return self._post_process(results_list) + + def _post_process(self, results_list): + # list of dict to dict of list + results = dict() + if self.default_prompt_type == 'json': + metric_keys = ['thought', 'name', 'args_precision', 'args_recall', 'args_f1_score', 'parse_rate'] + if self.default_prompt_type == 'str': + if self.eval_type == 'reason': + metric_keys = ['thought', 'parse_rate'] + if self.eval_type == 'retrieve': + metric_keys = ['name', 'parse_rate'] + if self.eval_type == 'understand': + metric_keys = ['args_precision', 'args_recall', 'args_f1_score', 'parse_rate'] + for key in metric_keys: + results[key] = mean([result[key] for result in results_list]) + return results diff --git a/opencompass/datasets/teval/evaluators/review_evaluator.py b/opencompass/datasets/teval/evaluators/review_evaluator.py index 68a14668..27c595b8 100644 --- a/opencompass/datasets/teval/evaluators/review_evaluator.py +++ b/opencompass/datasets/teval/evaluators/review_evaluator.py @@ -1,123 +1,123 @@ -from collections import defaultdict -from mmengine import load - -from ..schema import ResponseDataSample -import numpy as np -from ..utils.format_load import format_load - -class ReviewEvaluator: - """Review Capability Evaluation - - Args: - dataset_path(str): File path of evaluation dataset. 
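The per-sample scoring in `_evaluate` above combines an embedding similarity for the free-text `thought` field with key-wise precision/recall/F1 over the `args` dictionary. The following is a minimal standalone sketch of that logic, not part of the evaluator's API: the helper names are hypothetical, and running it assumes `numpy` and `sentence-transformers` are installed and the default `all-mpnet-base-v2` model can be downloaded.

```python
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Hypothetical helpers mirroring the per-sample scoring logic above.
_model = SentenceTransformer('all-mpnet-base-v2')  # the evaluator's default bert_score_model


def thought_score(pred_thought: str, gt_thought: str) -> float:
    """Cosine similarity of sentence embeddings, clipped at 0."""
    emb = _model.encode([pred_thought, gt_thought], convert_to_tensor=True)
    return float(np.maximum(util.cos_sim(emb[0], emb[1]).cpu().numpy(), 0)[0, 0])


def args_prf(pred_args: dict, gt_args: dict) -> dict:
    """Key-wise exact-match precision/recall/F1; values compared as strings."""
    if not pred_args and not gt_args:
        return dict(args_precision=1.0, args_recall=1.0, args_f1_score=1.0)
    if not pred_args or not gt_args:
        return dict(args_precision=0.0, args_recall=0.0, args_f1_score=0.0)
    correct = sum(1 for k, v in gt_args.items()
                  if k in pred_args and str(pred_args[k]) == str(v))
    p, r = correct / len(pred_args), correct / len(gt_args)
    f1 = 0.0 if p + r == 0 else 2 * p * r / (p + r)
    return dict(args_precision=p, args_recall=r, args_f1_score=f1)


print(thought_score('Check the weather in Beijing tomorrow.',
                    "Look up tomorrow's weather forecast for Beijing."))
print(args_prf({'city': 'Beijing', 'date': 'tomorrow'},
               {'city': 'Beijing', 'date': '2024-08-01'}))
# second print -> {'args_precision': 0.5, 'args_recall': 0.5, 'args_f1_score': 0.5}
```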
- - """ - - def __init__( - self, - dataset_path: str, - # bert_score_model: str = "all-mpnet-base-v2", - **kwargs, - ) -> None: - self.dataset_path = dataset_path - # self.bert_score_model = bert_score_model - # self.sentence_model = SentenceTransformer(self.bert_score_model) - - def _load_dataset(self): - self.dataset = [] - dataset = load(self.dataset_path) - - for key in dataset.keys(): - datum = dataset[key] - data_sample = self._process_response(datum) - - self.dataset.append( - dict( - origin_prompt=datum['origin_prompt'], - response_data_sample=data_sample)) - self.num_samples = len(self.dataset) - - def _process_response( - self, - datum: dict, - ) -> ResponseDataSample: - """Process the response to needed format. - - Args: - datum(dict): inputs. - - Returns: - dict: Processed response data sample. - """ - - template = datum['template'] - pred_data = datum['prediction'] - gt_data = datum['ground_truth']['answer'] - meta_data = datum['meta_data'] - - if meta_data['response_format'] == 'json': - pred_data = self.json_format_parse(pred_data) - else: - pred_data = pred_data[pred_data.find(":") + 1:] - pred_data = pred_data.strip() - if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']: - pred_data = pred_data[0] - else: - pred_data = None - - return ResponseDataSample( - template=template, pred=pred_data, gt=gt_data, meta_data=meta_data) - - def _evaluate(self, data_sample) -> dict: - metrics_result = dict( - parse_rate=0, - review_quality=0, - ) - - pred_data = data_sample.pred - if pred_data is not None: - # import pdb; pdb.set_trace() - metrics_result['review_quality'] = 1.0 if pred_data == \ - data_sample.gt else 0.0 - metrics_result['parse_rate'] = 1.0 - return metrics_result - - # def compute_sen_similarity(self, gt, pred): - # gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True) - # pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True) - # sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item()) - # return sen_sim - - def json_format_parse(self, pred_data): - try: - data = format_load(pred_data) - except Exception as e: - return None - try: - new_data = dict() - new_data['review'] = data['is_finished'] - assert new_data['review'] in [True, False] - except Exception as e: - return None - return new_data - - def evaluate(self): - self._load_dataset() - results_list = [] - for data_sample in self.dataset: - metrics_result = self._evaluate( - data_sample['response_data_sample']) - results_list.append(metrics_result) - return self._post_process(results_list) - - def _post_process(self, results_list): - # list of dict to dict of list - results_dict = defaultdict(list) - { - results_dict[key].append(sub[key]) - for sub in results_list for key in sub - } - metric_list = ['parse_rate', 'review_quality'] - for metric in metric_list: - results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4) - return results_dict +from collections import defaultdict +from mmengine import load + +from ..schema import ResponseDataSample +import numpy as np +from ..utils.format_load import format_load + +class ReviewEvaluator: + """Review Capability Evaluation + + Args: + dataset_path(str): File path of evaluation dataset. 
+ + """ + + def __init__( + self, + dataset_path: str, + # bert_score_model: str = "all-mpnet-base-v2", + **kwargs, + ) -> None: + self.dataset_path = dataset_path + # self.bert_score_model = bert_score_model + # self.sentence_model = SentenceTransformer(self.bert_score_model) + + def _load_dataset(self): + self.dataset = [] + dataset = load(self.dataset_path) + + for key in dataset.keys(): + datum = dataset[key] + data_sample = self._process_response(datum) + + self.dataset.append( + dict( + origin_prompt=datum['origin_prompt'], + response_data_sample=data_sample)) + self.num_samples = len(self.dataset) + + def _process_response( + self, + datum: dict, + ) -> ResponseDataSample: + """Process the response to needed format. + + Args: + datum(dict): inputs. + + Returns: + dict: Processed response data sample. + """ + + template = datum['template'] + pred_data = datum['prediction'] + gt_data = datum['ground_truth']['answer'] + meta_data = datum['meta_data'] + + if meta_data['response_format'] == 'json': + pred_data = self.json_format_parse(pred_data) + else: + pred_data = pred_data[pred_data.find(":") + 1:] + pred_data = pred_data.strip() + if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']: + pred_data = pred_data[0] + else: + pred_data = None + + return ResponseDataSample( + template=template, pred=pred_data, gt=gt_data, meta_data=meta_data) + + def _evaluate(self, data_sample) -> dict: + metrics_result = dict( + parse_rate=0, + review_quality=0, + ) + + pred_data = data_sample.pred + if pred_data is not None: + # import pdb; pdb.set_trace() + metrics_result['review_quality'] = 1.0 if pred_data == \ + data_sample.gt else 0.0 + metrics_result['parse_rate'] = 1.0 + return metrics_result + + # def compute_sen_similarity(self, gt, pred): + # gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True) + # pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True) + # sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item()) + # return sen_sim + + def json_format_parse(self, pred_data): + try: + data = format_load(pred_data) + except Exception as e: + return None + try: + new_data = dict() + new_data['review'] = data['is_finished'] + assert new_data['review'] in [True, False] + except Exception as e: + return None + return new_data + + def evaluate(self): + self._load_dataset() + results_list = [] + for data_sample in self.dataset: + metrics_result = self._evaluate( + data_sample['response_data_sample']) + results_list.append(metrics_result) + return self._post_process(results_list) + + def _post_process(self, results_list): + # list of dict to dict of list + results_dict = defaultdict(list) + { + results_dict[key].append(sub[key]) + for sub in results_list for key in sub + } + metric_list = ['parse_rate', 'review_quality'] + for metric in metric_list: + results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4) + return results_dict diff --git a/opencompass/datasets/teval/schema.py b/opencompass/datasets/teval/schema.py index 7188a658..4b732334 100644 --- a/opencompass/datasets/teval/schema.py +++ b/opencompass/datasets/teval/schema.py @@ -1,19 +1,19 @@ -from dataclasses import asdict, dataclass, field -from typing import Any, Dict - - -@dataclass -class ResponseDataSample: - """ - Args: - template(str): Format string with keyword-only arguments. For - example '{who} like {what}' - pred(Any): Parsed data from LLM generating response. 
- gt(Any): Ground truth data - meta_data(dict, optional): Meta information will be used to evaluate - LLM's response - """ - template: str - pred: Any - gt: Any - meta_data: dict = None +from dataclasses import asdict, dataclass, field +from typing import Any, Dict + + +@dataclass +class ResponseDataSample: + """ + Args: + template(str): Format string with keyword-only arguments. For + example '{who} like {what}' + pred(Any): Parsed data from LLM generating response. + gt(Any): Ground truth data + meta_data(dict, optional): Meta information will be used to evaluate + LLM's response + """ + template: str + pred: Any + gt: Any + meta_data: dict = None diff --git a/opencompass/datasets/teval/utils/convert_results.py b/opencompass/datasets/teval/utils/convert_results.py index d6b99471..0d5b7a1c 100644 --- a/opencompass/datasets/teval/utils/convert_results.py +++ b/opencompass/datasets/teval/utils/convert_results.py @@ -1,35 +1,35 @@ -import mmengine -import os -import argparse -import numpy as np -# np.set_printoptions(precision=1) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--result_path', type=str) - args = parser.parse_args() - return args - -def convert_results(result_path): - result = mmengine.load(result_path) - instruct_list = [(result['instruct_json']['json_format_metric'] + result['instruct_json']['json_args_em_metric']) / 2, \ - (result['instruct_json']['string_format_metric'] + result['instruct_json']['string_args_em_metric']) / 2] - plan_list = [result['plan_str']['f1_score'], result['plan_json']['f1_score']] - reason_list = [result['reason_str']['thought'], result['rru_json']['thought']] - retrieve_list = [result['retrieve_str']['name'], result['rru_json']['name']] - understand_list = [result['understand_str']['args'], result['rru_json']['args']] - review_list = [result['review_str']['review_quality'], result['review_str']['review_quality']] - - final_score = [np.mean(instruct_list), np.mean(plan_list), np.mean(reason_list), \ - np.mean(retrieve_list), np.mean(understand_list), np.mean(review_list)] - overall = np.mean(final_score) - final_score.insert(0, overall) - name_list = ['Overall', 'Instruct', 'Plan', 'Reason', 'Retrieve', 'Understand', 'Review'] - print("Cut Paste Results: ", np.array(final_score) * 100) - for i in range(len(name_list)): - print("%s: %.1f" % (name_list[i], final_score[i]*100), end='\t') - - -if __name__ == '__main__': - args = parse_args() - convert_results(args.result_path) +import mmengine +import os +import argparse +import numpy as np +# np.set_printoptions(precision=1) + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--result_path', type=str) + args = parser.parse_args() + return args + +def convert_results(result_path): + result = mmengine.load(result_path) + instruct_list = [(result['instruct_json']['json_format_metric'] + result['instruct_json']['json_args_em_metric']) / 2, \ + (result['instruct_json']['string_format_metric'] + result['instruct_json']['string_args_em_metric']) / 2] + plan_list = [result['plan_str']['f1_score'], result['plan_json']['f1_score']] + reason_list = [result['reason_str']['thought'], result['rru_json']['thought']] + retrieve_list = [result['retrieve_str']['name'], result['rru_json']['name']] + understand_list = [result['understand_str']['args'], result['rru_json']['args']] + review_list = [result['review_str']['review_quality'], result['review_str']['review_quality']] + + final_score = [np.mean(instruct_list), np.mean(plan_list), np.mean(reason_list), \ 
+ np.mean(retrieve_list), np.mean(understand_list), np.mean(review_list)] + overall = np.mean(final_score) + final_score.insert(0, overall) + name_list = ['Overall', 'Instruct', 'Plan', 'Reason', 'Retrieve', 'Understand', 'Review'] + print("Cut Paste Results: ", np.array(final_score) * 100) + for i in range(len(name_list)): + print("%s: %.1f" % (name_list[i], final_score[i]*100), end='\t') + + +if __name__ == '__main__': + args = parse_args() + convert_results(args.result_path) diff --git a/opencompass/datasets/teval/utils/format_load.py b/opencompass/datasets/teval/utils/format_load.py index 8419fe65..ecef715c 100644 --- a/opencompass/datasets/teval/utils/format_load.py +++ b/opencompass/datasets/teval/utils/format_load.py @@ -1,44 +1,44 @@ -import ast -import json -def format_load(raw_data: str, start_character: str = '', end_character: str = ''): - """Format the raw data into the format that can be evaluated. - - Args: - raw_data (str): The raw data. - start_character (str, optional): The start character. Defaults to '', if using it, the string will be sliced from the first start_character. - end_character (str, optional): The end character. Defaults to '', if using it, the string will be sliced to the last end_character. - - Returns: - str: The formatted data. - """ - if type(raw_data) != str: - # the data has been evaluated - return raw_data - if "```json" in raw_data: - raw_data = raw_data[raw_data.find("```json") + len("```json"):] - raw_data = raw_data.strip("`") - if start_character != '': - raw_data = raw_data[raw_data.find(start_character):] - if end_character != '': - raw_data = raw_data[:raw_data.rfind(end_character) + len(end_character)] - successful_parse = False - try: - data = ast.literal_eval(raw_data) - successful_parse = True - except Exception as e: - pass - try: - if not successful_parse: - data = json.loads(raw_data) - successful_parse = True - except Exception as e: - pass - try: - if not successful_parse: - data = json.loads(raw_data.replace("\'", "\"")) - successful_parse = True - except Exception as e: - pass - if not successful_parse: - raise Exception("Cannot parse raw data") - return data +import ast +import json +def format_load(raw_data: str, start_character: str = '', end_character: str = ''): + """Format the raw data into the format that can be evaluated. + + Args: + raw_data (str): The raw data. + start_character (str, optional): The start character. Defaults to '', if using it, the string will be sliced from the first start_character. + end_character (str, optional): The end character. Defaults to '', if using it, the string will be sliced to the last end_character. + + Returns: + str: The formatted data. 
+ """ + if type(raw_data) != str: + # the data has been evaluated + return raw_data + if "```json" in raw_data: + raw_data = raw_data[raw_data.find("```json") + len("```json"):] + raw_data = raw_data.strip("`") + if start_character != '': + raw_data = raw_data[raw_data.find(start_character):] + if end_character != '': + raw_data = raw_data[:raw_data.rfind(end_character) + len(end_character)] + successful_parse = False + try: + data = ast.literal_eval(raw_data) + successful_parse = True + except Exception as e: + pass + try: + if not successful_parse: + data = json.loads(raw_data) + successful_parse = True + except Exception as e: + pass + try: + if not successful_parse: + data = json.loads(raw_data.replace("\'", "\"")) + successful_parse = True + except Exception as e: + pass + if not successful_parse: + raise Exception("Cannot parse raw data") + return data diff --git a/opencompass/datasets/teval/utils/meta_template.py b/opencompass/datasets/teval/utils/meta_template.py index 8603d1b5..2dcd77d8 100644 --- a/opencompass/datasets/teval/utils/meta_template.py +++ b/opencompass/datasets/teval/utils/meta_template.py @@ -1,11 +1,11 @@ -meta_template_dict = dict( - internlm = [ - dict(role='system', begin='<|System|>:', end='\n'), - dict(role='user', begin='<|User|>:', end='\n'), - dict( - role='assistant', - begin='<|Bot|>:', - end='\n', - generate=True) - ], -) +meta_template_dict = dict( + internlm = [ + dict(role='system', begin='<|System|>:', end='\n'), + dict(role='user', begin='<|User|>:', end='\n'), + dict( + role='assistant', + begin='<|Bot|>:', + end='\n', + generate=True) + ], +) diff --git a/opencompass/datasets/teval/utils/template.py b/opencompass/datasets/teval/utils/template.py index d2958470..1ded41dd 100644 --- a/opencompass/datasets/teval/utils/template.py +++ b/opencompass/datasets/teval/utils/template.py @@ -1,76 +1,76 @@ -import re -from string import Formatter - - -def format_string(template: str, input_data: dict) -> str: - """Return string with input content according input format template. - - Args: - template (str): Format string with keyword-only argument. For - example '{who} like {what}' - input_data (dict): Input data to fill in the input template. - - Returns: - str: Return string. - """ - - return template.format(**input_data) - - -def parse_string(template: str, input_string: str, allow_newline: bool=False) -> dict: - """Return a dictionary whose keys are from input template and value is - responding content from input_string. - - Args: - template (str): Format template with keyword-only argument. For - example '{who} like {what}' - input_string (str): Input string will be parsed. - allow_newline (boolen): Whether allow '\n' in {} during RE match, default to False. - - Returns: - dict: Parsed data from input string according to format string. If - input string doesn't match template, It will return None. 
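A rough usage sketch for the `format_load` helper above. The import path follows the file path in the diff header (`opencompass/datasets/teval/utils/format_load.py`); whether it is importable this way depends on how opencompass is installed, and the input string is a made-up example.

```python
from opencompass.datasets.teval.utils.format_load import format_load

# A fenced, single-quoted "JSON" blob is still recovered via the
# ast.literal_eval / json.loads fallbacks:
raw = ("```json\n"
       "{'thought': 'look up the weather', 'name': 'weather.query', "
       "'args': {'city': 'Beijing'}}\n```")
parsed = format_load(raw)
print(parsed['name'])  # -> weather.query

# Already-parsed (non-string) input is returned unchanged:
assert format_load({'a': 1}) == {'a': 1}
```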
- - Examples: - >>> template = '{who} like {what}' - >>> input_string = 'monkey like banana' - >>> data = parse_string(template, input_string) - >>> data - >>> {'who': 'monkey', 'what': 'banana'} - >>> input_string = 'monkey likes banana' - >>> data = parse_string(template, input_string) - >>> data - >>> None - >>> template = '{what} like {what}' - >>> input_string = 'monkey like banana' - >>> data = parse_string(template, input_string) - >>> data - >>> {'what': ['monkey', 'banana']} - """ - - formatter = Formatter() - context = [] - keys = [] - for v in formatter.parse(template): - # v is (literal_text, field_name, format_spec, conversion) - if v[1] is not None: - keys.append(v[1]) - context.append(v[0]) - pattern = template - for k in keys: - pattern = pattern.replace('{' + f'{k}' + '}', '(.*)') - # pattern = re.compile(rf'{pattern}') - values = re.findall(pattern, input_string, re.S if allow_newline else 0) - if len(values) < 1: - return None - data = dict() - for k, v in zip(keys, values[0]): - if k in data: - tmp = data[k] - if isinstance(tmp, list): - data[k].append(v) - else: - data[k] = [tmp, v] - else: - data[k] = v - return data +import re +from string import Formatter + + +def format_string(template: str, input_data: dict) -> str: + """Return string with input content according input format template. + + Args: + template (str): Format string with keyword-only argument. For + example '{who} like {what}' + input_data (dict): Input data to fill in the input template. + + Returns: + str: Return string. + """ + + return template.format(**input_data) + + +def parse_string(template: str, input_string: str, allow_newline: bool=False) -> dict: + """Return a dictionary whose keys are from input template and value is + responding content from input_string. + + Args: + template (str): Format template with keyword-only argument. For + example '{who} like {what}' + input_string (str): Input string will be parsed. + allow_newline (boolen): Whether allow '\n' in {} during RE match, default to False. + + Returns: + dict: Parsed data from input string according to format string. If + input string doesn't match template, It will return None. 
+ + Examples: + >>> template = '{who} like {what}' + >>> input_string = 'monkey like banana' + >>> data = parse_string(template, input_string) + >>> data + >>> {'who': 'monkey', 'what': 'banana'} + >>> input_string = 'monkey likes banana' + >>> data = parse_string(template, input_string) + >>> data + >>> None + >>> template = '{what} like {what}' + >>> input_string = 'monkey like banana' + >>> data = parse_string(template, input_string) + >>> data + >>> {'what': ['monkey', 'banana']} + """ + + formatter = Formatter() + context = [] + keys = [] + for v in formatter.parse(template): + # v is (literal_text, field_name, format_spec, conversion) + if v[1] is not None: + keys.append(v[1]) + context.append(v[0]) + pattern = template + for k in keys: + pattern = pattern.replace('{' + f'{k}' + '}', '(.*)') + # pattern = re.compile(rf'{pattern}') + values = re.findall(pattern, input_string, re.S if allow_newline else 0) + if len(values) < 1: + return None + data = dict() + for k, v in zip(keys, values[0]): + if k in data: + tmp = data[k] + if isinstance(tmp, list): + data[k].append(v) + else: + data[k] = [tmp, v] + else: + data[k] = v + return data diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py b/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py index aa5a0732..ba6deede 100644 --- a/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py +++ b/opencompass/openicl/icl_evaluator/hf_metrics/accuracy.py @@ -1,106 +1,106 @@ -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Accuracy metric.""" - -import datasets -from sklearn.metrics import accuracy_score - -import evaluate - - -_DESCRIPTION = """ -Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: -Accuracy = (TP + TN) / (TP + TN + FP + FN) - Where: -TP: True positive -TN: True negative -FP: False positive -FN: False negative -""" - - -_KWARGS_DESCRIPTION = """ -Args: - predictions (`list` of `int`): Predicted labels. - references (`list` of `int`): Ground truth labels. - normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. - sample_weight (`list` of `float`): Sample weights Defaults to None. - -Returns: - accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. - -Examples: - - Example 1-A simple example - >>> accuracy_metric = evaluate.load("accuracy") - >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) - >>> print(results) - {'accuracy': 0.5} - - Example 2-The same as Example 1, except with `normalize` set to `False`. 
- >>> accuracy_metric = evaluate.load("accuracy") - >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) - >>> print(results) - {'accuracy': 3.0} - - Example 3-The same as Example 1, except with `sample_weight` set. - >>> accuracy_metric = evaluate.load("accuracy") - >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) - >>> print(results) - {'accuracy': 0.8778625954198473} -""" - - -_CITATION = """ -@article{scikit-learn, - title={Scikit-learn: Machine Learning in {P}ython}, - author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. - and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. - and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and - Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, - journal={Journal of Machine Learning Research}, - volume={12}, - pages={2825--2830}, - year={2011} -} -""" - - -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class Accuracy(evaluate.Metric): - def _info(self): - return evaluate.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Sequence(datasets.Value("int32")), - "references": datasets.Sequence(datasets.Value("int32")), - } - if self.config_name == "multilabel" - else { - "predictions": datasets.Value("int32"), - "references": datasets.Value("int32"), - } - ), - reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], - ) - - def _compute(self, predictions, references, normalize=True, sample_weight=None): - return { - "accuracy": float( - accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) - ) - } +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +import datasets +from sklearn.metrics import accuracy_score + +import evaluate + + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) + Where: +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + + +_KWARGS_DESCRIPTION = """ +Args: + predictions (`list` of `int`): Predicted labels. + references (`list` of `int`): Ground truth labels. + normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. + sample_weight (`list` of `float`): Sample weights Defaults to None. + +Returns: + accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. 
A higher score means higher accuracy. + +Examples: + + Example 1-A simple example + >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) + >>> print(results) + {'accuracy': 0.5} + + Example 2-The same as Example 1, except with `normalize` set to `False`. + >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) + >>> print(results) + {'accuracy': 3.0} + + Example 3-The same as Example 1, except with `sample_weight` set. + >>> accuracy_metric = evaluate.load("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) + >>> print(results) + {'accuracy': 0.8778625954198473} +""" + + +_CITATION = """ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Accuracy(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } + if self.config_name == "multilabel" + else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + } + ), + reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], + ) + + def _compute(self, predictions, references, normalize=True, sample_weight=None): + return { + "accuracy": float( + accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) + ) + } diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py b/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py index 353301cc..194a5415 100644 --- a/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py +++ b/opencompass/openicl/icl_evaluator/hf_metrics/rouge.py @@ -1,158 +1,158 @@ -# Copyright 2020 The HuggingFace Evaluate Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" ROUGE metric from Google Research github repo. 
""" - -# The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt -import absl # Here to have a nice missing dependency error message early on -import datasets -import nltk # Here to have a nice missing dependency error message early on -import numpy # Here to have a nice missing dependency error message early on -import six # Here to have a nice missing dependency error message early on -from rouge_score import rouge_scorer, scoring - -import evaluate - - -_CITATION = """\ -@inproceedings{lin-2004-rouge, - title = "{ROUGE}: A Package for Automatic Evaluation of Summaries", - author = "Lin, Chin-Yew", - booktitle = "Text Summarization Branches Out", - month = jul, - year = "2004", - address = "Barcelona, Spain", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/W04-1013", - pages = "74--81", -} -""" - -_DESCRIPTION = """\ -ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for -evaluating automatic summarization and machine translation software in natural language processing. -The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. - -Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters. - -This metrics is a wrapper around Google Research reimplementation of ROUGE: -https://github.com/google-research/google-research/tree/master/rouge -""" - -_KWARGS_DESCRIPTION = """ -Calculates average rouge scores for a list of hypotheses and references -Args: - predictions: list of predictions to score. Each prediction - should be a string with tokens separated by spaces. - references: list of reference for each prediction. Each - reference should be a string with tokens separated by spaces. - rouge_types: A list of rouge types to calculate. - Valid names: - `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring, - `"rougeL"`: Longest common subsequence based scoring. - `"rougeLsum"`: rougeLsum splits text using `"\n"`. - See details in https://github.com/huggingface/datasets/issues/617 - use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes. 
- use_aggregator: Return aggregates if this is set to True -Returns: - rouge1: rouge_1 (f1), - rouge2: rouge_2 (f1), - rougeL: rouge_l (f1), - rougeLsum: rouge_lsum (f1) -Examples: - - >>> rouge = evaluate.load('rouge') - >>> predictions = ["hello there", "general kenobi"] - >>> references = ["hello there", "general kenobi"] - >>> results = rouge.compute(predictions=predictions, references=references) - >>> print(results) - {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0} -""" - - -class Tokenizer: - """Helper class to wrap a callable into a class with a `tokenize` method as used by rouge-score.""" - - def __init__(self, tokenizer_func): - self.tokenizer_func = tokenizer_func - - def tokenize(self, text): - return self.tokenizer_func(text) - - -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class Rouge(evaluate.Metric): - def _info(self): - return evaluate.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - features=[ - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence(datasets.Value("string", id="sequence")), - } - ), - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - } - ), - ], - codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"], - reference_urls=[ - "https://en.wikipedia.org/wiki/ROUGE_(metric)", - "https://github.com/google-research/google-research/tree/master/rouge", - ], - ) - - def _compute( - self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False, tokenizer=None - ): - if rouge_types is None: - rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"] - - multi_ref = isinstance(references[0], list) - - if tokenizer is not None: - tokenizer = Tokenizer(tokenizer) - - scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer, tokenizer=tokenizer) - if use_aggregator: - aggregator = scoring.BootstrapAggregator() - else: - scores = [] - - for ref, pred in zip(references, predictions): - if multi_ref: - score = scorer.score_multi(ref, pred) - else: - score = scorer.score(ref, pred) - if use_aggregator: - aggregator.add_scores(score) - else: - scores.append(score) - - if use_aggregator: - result = aggregator.aggregate() - for key in result: - result[key] = result[key].mid.fmeasure - - else: - result = {} - for key in scores[0]: - result[key] = list(score[key].fmeasure for score in scores) - - return result +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ROUGE metric from Google Research github repo. 
""" + +# The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt +import absl # Here to have a nice missing dependency error message early on +import datasets +import nltk # Here to have a nice missing dependency error message early on +import numpy # Here to have a nice missing dependency error message early on +import six # Here to have a nice missing dependency error message early on +from rouge_score import rouge_scorer, scoring + +import evaluate + + +_CITATION = """\ +@inproceedings{lin-2004-rouge, + title = "{ROUGE}: A Package for Automatic Evaluation of Summaries", + author = "Lin, Chin-Yew", + booktitle = "Text Summarization Branches Out", + month = jul, + year = "2004", + address = "Barcelona, Spain", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W04-1013", + pages = "74--81", +} +""" + +_DESCRIPTION = """\ +ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for +evaluating automatic summarization and machine translation software in natural language processing. +The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. + +Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters. + +This metrics is a wrapper around Google Research reimplementation of ROUGE: +https://github.com/google-research/google-research/tree/master/rouge +""" + +_KWARGS_DESCRIPTION = """ +Calculates average rouge scores for a list of hypotheses and references +Args: + predictions: list of predictions to score. Each prediction + should be a string with tokens separated by spaces. + references: list of reference for each prediction. Each + reference should be a string with tokens separated by spaces. + rouge_types: A list of rouge types to calculate. + Valid names: + `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring, + `"rougeL"`: Longest common subsequence based scoring. + `"rougeLsum"`: rougeLsum splits text using `"\n"`. + See details in https://github.com/huggingface/datasets/issues/617 + use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes. 
+ use_aggregator: Return aggregates if this is set to True +Returns: + rouge1: rouge_1 (f1), + rouge2: rouge_2 (f1), + rougeL: rouge_l (f1), + rougeLsum: rouge_lsum (f1) +Examples: + + >>> rouge = evaluate.load('rouge') + >>> predictions = ["hello there", "general kenobi"] + >>> references = ["hello there", "general kenobi"] + >>> results = rouge.compute(predictions=predictions, references=references) + >>> print(results) + {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0} +""" + + +class Tokenizer: + """Helper class to wrap a callable into a class with a `tokenize` method as used by rouge-score.""" + + def __init__(self, tokenizer_func): + self.tokenizer_func = tokenizer_func + + def tokenize(self, text): + return self.tokenizer_func(text) + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Rouge(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence")), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"], + reference_urls=[ + "https://en.wikipedia.org/wiki/ROUGE_(metric)", + "https://github.com/google-research/google-research/tree/master/rouge", + ], + ) + + def _compute( + self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False, tokenizer=None + ): + if rouge_types is None: + rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + multi_ref = isinstance(references[0], list) + + if tokenizer is not None: + tokenizer = Tokenizer(tokenizer) + + scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer, tokenizer=tokenizer) + if use_aggregator: + aggregator = scoring.BootstrapAggregator() + else: + scores = [] + + for ref, pred in zip(references, predictions): + if multi_ref: + score = scorer.score_multi(ref, pred) + else: + score = scorer.score(ref, pred) + if use_aggregator: + aggregator.add_scores(score) + else: + scores.append(score) + + if use_aggregator: + result = aggregator.aggregate() + for key in result: + result[key] = result[key].mid.fmeasure + + else: + result = {} + for key in scores[0]: + result[key] = list(score[key].fmeasure for score in scores) + + return result diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py b/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py index 6e756f4d..79bce9de 100644 --- a/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py +++ b/opencompass/openicl/icl_evaluator/hf_metrics/sacrebleu.py @@ -1,178 +1,178 @@ -# Copyright 2020 The HuggingFace Evaluate Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" SACREBLEU metric. 
""" - -import datasets -import sacrebleu as scb -from packaging import version - -import evaluate - - -_CITATION = """\ -@inproceedings{post-2018-call, - title = "A Call for Clarity in Reporting {BLEU} Scores", - author = "Post, Matt", - booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", - month = oct, - year = "2018", - address = "Belgium, Brussels", - publisher = "Association for Computational Linguistics", - url = "https://www.aclweb.org/anthology/W18-6319", - pages = "186--191", -} -""" - -_DESCRIPTION = """\ -SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. -Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. -It also knows all the standard test sets and handles downloading, processing, and tokenization for you. - -See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information. -""" - -_KWARGS_DESCRIPTION = """ -Produces BLEU scores along with its sufficient statistics -from a source against one or more references. - -Args: - predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens. - references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length). - smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are: - - `'none'`: no smoothing - - `'floor'`: increment zero counts - - `'add-k'`: increment num/denom by k for n>1 - - `'exp'`: exponential decay - smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`). - tokenize (`str`): Tokenization method to use for BLEU. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are: - - `'none'`: No tokenization. - - `'zh'`: Chinese tokenization. - - `'13a'`: mimics the `mteval-v13a` script from Moses. - - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses - - `'char'`: Language-agnostic character-level tokenization. - - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3). - lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`. - force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`. - use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True`, if sentence-level BLEU will be computed. Defaults to `False`. 
- -Returns: - 'score': BLEU score, - 'counts': Counts, - 'totals': Totals, - 'precisions': Precisions, - 'bp': Brevity penalty, - 'sys_len': predictions length, - 'ref_len': reference length, - -Examples: - - Example 1: - >>> predictions = ["hello there general kenobi", "foo bar foobar"] - >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] - >>> sacrebleu = evaluate.load("sacrebleu") - >>> results = sacrebleu.compute(predictions=predictions, references=references) - >>> print(list(results.keys())) - ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] - >>> print(round(results["score"], 1)) - 100.0 - - Example 2: - >>> predictions = ["hello there general kenobi", - ... "on our way to ankh morpork"] - >>> references = [["hello there general kenobi", "hello there !"], - ... ["goodbye ankh morpork", "ankh morpork"]] - >>> sacrebleu = evaluate.load("sacrebleu") - >>> results = sacrebleu.compute(predictions=predictions, - ... references=references) - >>> print(list(results.keys())) - ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] - >>> print(round(results["score"], 1)) - 39.8 -""" - - -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class Sacrebleu(evaluate.Metric): - def _info(self): - if version.parse(scb.__version__) < version.parse("1.4.12"): - raise ImportWarning( - "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" - 'You can install it with `pip install "sacrebleu>=1.4.12"`.' - ) - return evaluate.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - homepage="https://github.com/mjpost/sacreBLEU", - inputs_description=_KWARGS_DESCRIPTION, - features=[ - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), - } - ), - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - } - ), - ], - codebase_urls=["https://github.com/mjpost/sacreBLEU"], - reference_urls=[ - "https://github.com/mjpost/sacreBLEU", - "https://en.wikipedia.org/wiki/BLEU", - "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", - ], - ) - - def _compute( - self, - predictions, - references, - smooth_method="exp", - smooth_value=None, - force=False, - lowercase=False, - tokenize=None, - use_effective_order=False, - ): - # if only one reference is provided make sure we still use list of lists - if isinstance(references[0], str): - references = [[ref] for ref in references] - - references_per_prediction = len(references[0]) - if any(len(refs) != references_per_prediction for refs in references): - raise ValueError("Sacrebleu requires the same number of references for each prediction") - transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] - output = scb.corpus_bleu( - predictions, - transformed_references, - smooth_method=smooth_method, - smooth_value=smooth_value, - force=force, - lowercase=lowercase, - use_effective_order=use_effective_order, - **(dict(tokenize=tokenize) if tokenize else {}), - ) - output_dict = { - "score": output.score, - "counts": output.counts, - "totals": output.totals, - "precisions": output.precisions, - "bp": output.bp, - "sys_len": output.sys_len, - "ref_len": output.ref_len, - } 
- return output_dict +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" SACREBLEU metric. """ + +import datasets +import sacrebleu as scb +from packaging import version + +import evaluate + + +_CITATION = """\ +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +SacreBLEU provides hassle-free computation of shareable, comparable, and reproducible BLEU scores. +Inspired by Rico Sennrich's `multi-bleu-detok.perl`, it produces the official WMT scores but works with plain text. +It also knows all the standard test sets and handles downloading, processing, and tokenization for you. + +See the [README.md] file at https://github.com/mjpost/sacreBLEU for more information. +""" + +_KWARGS_DESCRIPTION = """ +Produces BLEU scores along with its sufficient statistics +from a source against one or more references. + +Args: + predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens. + references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length). + smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are: + - `'none'`: no smoothing + - `'floor'`: increment zero counts + - `'add-k'`: increment num/denom by k for n>1 + - `'exp'`: exponential decay + smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`). + tokenize (`str`): Tokenization method to use for BLEU. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are: + - `'none'`: No tokenization. + - `'zh'`: Chinese tokenization. + - `'13a'`: mimics the `mteval-v13a` script from Moses. + - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses + - `'char'`: Language-agnostic character-level tokenization. + - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3). + lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`. + force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`. 
+ use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True`, if sentence-level BLEU will be computed. Defaults to `False`. + +Returns: + 'score': BLEU score, + 'counts': Counts, + 'totals': Totals, + 'precisions': Precisions, + 'bp': Brevity penalty, + 'sys_len': predictions length, + 'ref_len': reference length, + +Examples: + + Example 1: + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + >>> sacrebleu = evaluate.load("sacrebleu") + >>> results = sacrebleu.compute(predictions=predictions, references=references) + >>> print(list(results.keys())) + ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] + >>> print(round(results["score"], 1)) + 100.0 + + Example 2: + >>> predictions = ["hello there general kenobi", + ... "on our way to ankh morpork"] + >>> references = [["hello there general kenobi", "hello there !"], + ... ["goodbye ankh morpork", "ankh morpork"]] + >>> sacrebleu = evaluate.load("sacrebleu") + >>> results = sacrebleu.compute(predictions=predictions, + ... references=references) + >>> print(list(results.keys())) + ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len'] + >>> print(round(results["score"], 1)) + 39.8 +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Sacrebleu(evaluate.Metric): + def _info(self): + if version.parse(scb.__version__) < version.parse("1.4.12"): + raise ImportWarning( + "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" + 'You can install it with `pip install "sacrebleu>=1.4.12"`.' 
+ ) + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + homepage="https://github.com/mjpost/sacreBLEU", + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/mjpost/sacreBLEU"], + reference_urls=[ + "https://github.com/mjpost/sacreBLEU", + "https://en.wikipedia.org/wiki/BLEU", + "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", + ], + ) + + def _compute( + self, + predictions, + references, + smooth_method="exp", + smooth_value=None, + force=False, + lowercase=False, + tokenize=None, + use_effective_order=False, + ): + # if only one reference is provided make sure we still use list of lists + if isinstance(references[0], str): + references = [[ref] for ref in references] + + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] + output = scb.corpus_bleu( + predictions, + transformed_references, + smooth_method=smooth_method, + smooth_value=smooth_value, + force=force, + lowercase=lowercase, + use_effective_order=use_effective_order, + **(dict(tokenize=tokenize) if tokenize else {}), + ) + output_dict = { + "score": output.score, + "counts": output.counts, + "totals": output.totals, + "precisions": output.precisions, + "bp": output.bp, + "sys_len": output.sys_len, + "ref_len": output.ref_len, + } + return output_dict diff --git a/opencompass/openicl/icl_evaluator/hf_metrics/squad.py b/opencompass/openicl/icl_evaluator/hf_metrics/squad.py index 84658b12..4886d0ef 100644 --- a/opencompass/openicl/icl_evaluator/hf_metrics/squad.py +++ b/opencompass/openicl/icl_evaluator/hf_metrics/squad.py @@ -1,111 +1,111 @@ -# Copyright 2020 The HuggingFace Evaluate Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" SQuAD metric. """ - -import datasets - -import evaluate - -from .compute_score import compute_score - - -_CITATION = """\ -@inproceedings{Rajpurkar2016SQuAD10, - title={SQuAD: 100, 000+ Questions for Machine Comprehension of Text}, - author={Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang}, - booktitle={EMNLP}, - year={2016} -} -""" - -_DESCRIPTION = """ -This metric wrap the official scoring script for version 1 of the Stanford Question Answering Dataset (SQuAD). 
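The list comprehension building `transformed_references` above regroups references from per-prediction lists into per-reference-stream lists, which is the layout `corpus_bleu` expects. A tiny illustration of that regrouping, reusing the reference lists from the docstring example (plain Python, no sacrebleu required):

```python
# Per-prediction reference lists, as in the docstring example above:
references = [["hello there general kenobi", "hello there !"],
              ["goodbye ankh morpork", "ankh morpork"]]
n_refs = len(references[0])
# Regroup so that stream i holds the i-th reference of every prediction:
transformed = [[refs[i] for refs in references] for i in range(n_refs)]
print(transformed)
# [['hello there general kenobi', 'goodbye ankh morpork'],
#  ['hello there !', 'ankh morpork']]
```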
- -Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by -crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, -from the corresponding reading passage, or the question might be unanswerable. -""" - -_KWARGS_DESCRIPTION = """ -Computes SQuAD scores (F1 and EM). -Args: - predictions: List of question-answers dictionaries with the following key-values: - - 'id': id of the question-answer pair as given in the references (see below) - - 'prediction_text': the text of the answer - references: List of question-answers dictionaries with the following key-values: - - 'id': id of the question-answer pair (see above), - - 'answers': a Dict in the SQuAD dataset format - { - 'text': list of possible texts for the answer, as a list of strings - 'answer_start': list of start positions for the answer, as a list of ints - } - Note that answer_start values are not taken into account to compute the metric. -Returns: - 'exact_match': Exact match (the normalized answer exactly match the gold answer) - 'f1': The F-score of predicted tokens versus the gold answer -Examples: - - >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}] - >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}] - >>> squad_metric = evaluate.load("squad") - >>> results = squad_metric.compute(predictions=predictions, references=references) - >>> print(results) - {'exact_match': 100.0, 'f1': 100.0} -""" - - -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) -class Squad(evaluate.Metric): - def _info(self): - return evaluate.MetricInfo( - description=_DESCRIPTION, - citation=_CITATION, - inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": {"id": datasets.Value("string"), "prediction_text": datasets.Value("string")}, - "references": { - "id": datasets.Value("string"), - "answers": datasets.features.Sequence( - { - "text": datasets.Value("string"), - "answer_start": datasets.Value("int32"), - } - ), - }, - } - ), - codebase_urls=["https://rajpurkar.github.io/SQuAD-explorer/"], - reference_urls=["https://rajpurkar.github.io/SQuAD-explorer/"], - ) - - def _compute(self, predictions, references): - pred_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions} - dataset = [ - { - "paragraphs": [ - { - "qas": [ - { - "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]], - "id": ref["id"], - } - for ref in references - ] - } - ] - } - ] - score = compute_score(dataset=dataset, predictions=pred_dict) - return score +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" SQuAD metric. 
""" + +import datasets + +import evaluate + +from .compute_score import compute_score + + +_CITATION = """\ +@inproceedings{Rajpurkar2016SQuAD10, + title={SQuAD: 100, 000+ Questions for Machine Comprehension of Text}, + author={Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang}, + booktitle={EMNLP}, + year={2016} +} +""" + +_DESCRIPTION = """ +This metric wrap the official scoring script for version 1 of the Stanford Question Answering Dataset (SQuAD). + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by +crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, +from the corresponding reading passage, or the question might be unanswerable. +""" + +_KWARGS_DESCRIPTION = """ +Computes SQuAD scores (F1 and EM). +Args: + predictions: List of question-answers dictionaries with the following key-values: + - 'id': id of the question-answer pair as given in the references (see below) + - 'prediction_text': the text of the answer + references: List of question-answers dictionaries with the following key-values: + - 'id': id of the question-answer pair (see above), + - 'answers': a Dict in the SQuAD dataset format + { + 'text': list of possible texts for the answer, as a list of strings + 'answer_start': list of start positions for the answer, as a list of ints + } + Note that answer_start values are not taken into account to compute the metric. +Returns: + 'exact_match': Exact match (the normalized answer exactly match the gold answer) + 'f1': The F-score of predicted tokens versus the gold answer +Examples: + + >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}] + >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}] + >>> squad_metric = evaluate.load("squad") + >>> results = squad_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'exact_match': 100.0, 'f1': 100.0} +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Squad(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": {"id": datasets.Value("string"), "prediction_text": datasets.Value("string")}, + "references": { + "id": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + }, + } + ), + codebase_urls=["https://rajpurkar.github.io/SQuAD-explorer/"], + reference_urls=["https://rajpurkar.github.io/SQuAD-explorer/"], + ) + + def _compute(self, predictions, references): + pred_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions} + dataset = [ + { + "paragraphs": [ + { + "qas": [ + { + "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]], + "id": ref["id"], + } + for ref in references + ] + } + ] + } + ] + score = compute_score(dataset=dataset, predictions=pred_dict) + return score diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 94f2cf94..53814070 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -329,4 +329,4 @@ class LMEvaluator: else: kwargs = self.dict_postprocessor proc = 
DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) \ No newline at end of file + return proc(output, self.output_path, **kwargs) diff --git a/opencompass/utils/datasets.py b/opencompass/utils/datasets.py index dc1e237d..05591c52 100644 --- a/opencompass/utils/datasets.py +++ b/opencompass/utils/datasets.py @@ -1,105 +1,105 @@ -import os -from .fileio import download_and_extract_archive -from .datasets_info import DATASETS_MAPPING, DATASETS_URL -from .logging import get_logger - -USER_HOME = os.path.expanduser("~") -DEFAULT_DATA_FOLDER = os.path.join(USER_HOME, '.cache/opencompass/') - - -def get_data_path(dataset_id: str, local_mode: bool = False): - """return dataset id when getting data from ModelScope/HuggingFace repo, otherwise just - return local path as is. - - Args: - dataset_id (str): dataset id or data path - local_mode (bool): whether to use local path or - ModelScope/HuggignFace repo - """ - # update the path with CACHE_DIR - cache_dir = os.environ.get('COMPASS_DATA_CACHE', '') - - # For absolute path customized by the users - if dataset_id.startswith('/'): - return dataset_id - - # For relative path, with CACHE_DIR - if local_mode: - local_path = os.path.join(cache_dir, dataset_id) - if not os.path.exists(local_path): - return download_dataset(local_path, cache_dir) - else: - return local_path - - dataset_source = os.environ.get('DATASET_SOURCE', None) - if dataset_source == 'ModelScope': - ms_id = DATASETS_MAPPING[dataset_id]['ms_id'] - assert ms_id is not None, \ - f'{dataset_id} is not supported in ModelScope' - return ms_id - elif dataset_source == 'HF': - # TODO: HuggingFace mode is currently not supported! - hf_id = DATASETS_MAPPING[dataset_id]['hf_id'] - assert hf_id is not None, \ - f'{dataset_id} is not supported in HuggingFace' - return hf_id - else: - # for the local path - local_path = DATASETS_MAPPING[dataset_id]['local'] - local_path = os.path.join(cache_dir, local_path) - - if not os.path.exists(local_path): - return download_dataset(local_path, cache_dir) - else: - return local_path - - -def download_dataset(data_path, cache_dir, remove_finished=True): - - # Try to load from default cache folder - try_default_path = os.path.join(DEFAULT_DATA_FOLDER, data_path) - if os.path.exists(try_default_path): - get_logger().info(f"Try to load the data from {try_default_path}") - return try_default_path - - get_logger().info(f'{data_path} does not exist!' - 'Start Download data automatically!' - 'If you have downloaded the data before,' - 'You can specific `COMPASS_DATA_CACHE` ' - 'to avoid downloading~') - - # Cannot find data from default cache folder, download data. 
- # Update DATASET_URL for internal dataset - try: - import json - internal_datasets = '.OPENCOMPASS_INTERNAL_DATA_URL.json' - file_path = os.path.join(USER_HOME, internal_datasets) - assert os.path.exists(file_path), f"{file_path} not exits" - with open(file_path, 'r') as f: - internal_datasets_info = json.load(f) - DATASETS_URL.update(internal_datasets_info) - get_logger().info(f"Load internal dataset from: {file_path}") - except Exception as e: # noqa - pass - - valid_data_names = list(DATASETS_URL.keys()) - dataset_name = '' - for name in valid_data_names: - if name in data_path: - dataset_name = name - assert dataset_name, f'No valid url for {data_path}!\n' + \ - f'Please make sure `{data_path}` is correct' - dataset_info = DATASETS_URL[dataset_name] - dataset_url = dataset_info['url'] - dataset_md5 = dataset_info['md5'] - cache_dir = cache_dir if cache_dir else DEFAULT_DATA_FOLDER - - # download and extract files - download_and_extract_archive( - url=dataset_url, - download_root=os.path.join(cache_dir, 'data'), - md5=dataset_md5, - remove_finished=remove_finished - ) - - return os.path.join(cache_dir, data_path) +import os +from .fileio import download_and_extract_archive +from .datasets_info import DATASETS_MAPPING, DATASETS_URL +from .logging import get_logger + +USER_HOME = os.path.expanduser("~") +DEFAULT_DATA_FOLDER = os.path.join(USER_HOME, '.cache/opencompass/') + + +def get_data_path(dataset_id: str, local_mode: bool = False): + """return dataset id when getting data from ModelScope/HuggingFace repo, otherwise just + return local path as is. + + Args: + dataset_id (str): dataset id or data path + local_mode (bool): whether to use local path or + ModelScope/HuggignFace repo + """ + # update the path with CACHE_DIR + cache_dir = os.environ.get('COMPASS_DATA_CACHE', '') + + # For absolute path customized by the users + if dataset_id.startswith('/'): + return dataset_id + + # For relative path, with CACHE_DIR + if local_mode: + local_path = os.path.join(cache_dir, dataset_id) + if not os.path.exists(local_path): + return download_dataset(local_path, cache_dir) + else: + return local_path + + dataset_source = os.environ.get('DATASET_SOURCE', None) + if dataset_source == 'ModelScope': + ms_id = DATASETS_MAPPING[dataset_id]['ms_id'] + assert ms_id is not None, \ + f'{dataset_id} is not supported in ModelScope' + return ms_id + elif dataset_source == 'HF': + # TODO: HuggingFace mode is currently not supported! + hf_id = DATASETS_MAPPING[dataset_id]['hf_id'] + assert hf_id is not None, \ + f'{dataset_id} is not supported in HuggingFace' + return hf_id + else: + # for the local path + local_path = DATASETS_MAPPING[dataset_id]['local'] + local_path = os.path.join(cache_dir, local_path) + + if not os.path.exists(local_path): + return download_dataset(local_path, cache_dir) + else: + return local_path + + +def download_dataset(data_path, cache_dir, remove_finished=True): + + # Try to load from default cache folder + try_default_path = os.path.join(DEFAULT_DATA_FOLDER, data_path) + if os.path.exists(try_default_path): + get_logger().info(f"Try to load the data from {try_default_path}") + return try_default_path + + get_logger().info(f'{data_path} does not exist!' + 'Start Download data automatically!' + 'If you have downloaded the data before,' + 'You can specific `COMPASS_DATA_CACHE` ' + 'to avoid downloading~') + + # Cannot find data from default cache folder, download data. 
+ # Update DATASET_URL for internal dataset + try: + import json + internal_datasets = '.OPENCOMPASS_INTERNAL_DATA_URL.json' + file_path = os.path.join(USER_HOME, internal_datasets) + assert os.path.exists(file_path), f"{file_path} not exits" + with open(file_path, 'r') as f: + internal_datasets_info = json.load(f) + DATASETS_URL.update(internal_datasets_info) + get_logger().info(f"Load internal dataset from: {file_path}") + except Exception as e: # noqa + pass + + valid_data_names = list(DATASETS_URL.keys()) + dataset_name = '' + for name in valid_data_names: + if name in data_path: + dataset_name = name + assert dataset_name, f'No valid url for {data_path}!\n' + \ + f'Please make sure `{data_path}` is correct' + dataset_info = DATASETS_URL[dataset_name] + dataset_url = dataset_info['url'] + dataset_md5 = dataset_info['md5'] + cache_dir = cache_dir if cache_dir else DEFAULT_DATA_FOLDER + + # download and extract files + download_and_extract_archive( + url=dataset_url, + download_root=os.path.join(cache_dir, 'data'), + md5=dataset_md5, + remove_finished=remove_finished + ) + + return os.path.join(cache_dir, data_path) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 00db25e8..0160ca93 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -1,750 +1,755 @@ -DATASETS_MAPPING = { - # ADVGLUE Datasets - "opencompass/advglue-dev": { - "ms_id": None, - "hf_id": None, - "local": "./data/adv_glue/dev_ann.json", - }, - # AGIEval Datasets - "opencompass/agieval": { - "ms_id": "opencompass/agieval", - "hf_id": "opencompass/agieval", - "local": "./data/AGIEval/data/v1/", - }, - # ARC Datasets(Test) - "opencompass/ai2_arc-test": { - "ms_id": "opencompass/ai2_arc", - "hf_id": "opencompass/ai2_arc", - "local": "./data/ARC/ARC-c/ARC-Challenge-Test.jsonl", - }, - "opencompass/ai2_arc-dev": { - "ms_id": "opencompass/ai2_arc", - "hf_id": "opencompass/ai2_arc", - "local": "./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl", - }, - "opencompass/ai2_arc-easy-dev": { - "ms_id": "opencompass/ai2_arc", - "hf_id": "opencompass/ai2_arc", - "local": "./data/ARC/ARC-e/ARC-Easy-Dev.jsonl", - }, - # BBH - "opencompass/bbh": { - "ms_id": "opencompass/bbh", - "hf_id": "opencompass/bbh", - "local": "./data/BBH/data", - }, - # bbeh - "opencompass/bbeh": { - "ms_id": "", - "hf_id": "", - "local": "./data/bbeh/", - }, - # C-Eval - "opencompass/ceval-exam": { - "ms_id": "opencompass/ceval-exam", - "hf_id": "opencompass/ceval-exam", - "local": "./data/ceval/formal_ceval", - }, - # AFQMC - "opencompass/afqmc-dev": { - "ms_id": "opencompass/afqmc", - "hf_id": "opencompass/afqmc", - "local": "./data/CLUE/AFQMC/dev.json", - }, - # CMNLI - "opencompass/cmnli-dev": { - "ms_id": "opencompass/cmnli", - "hf_id": "opencompass/cmnli", - "local": "./data/CLUE/cmnli/cmnli_public/dev.json", - }, - # OCNLI - "opencompass/OCNLI-dev": { - "ms_id": "opencompass/OCNLI", - "hf_id": "opencompass/OCNLI", - "local": "./data/CLUE/OCNLI/dev.json", - }, - # ChemBench - "opencompass/ChemBench": { - "ms_id": "opencompass/ChemBench", - "hf_id": "opencompass/ChemBench", - "local": "./data/ChemBench/", - }, - # CMMLU - "opencompass/cmmlu": { - "ms_id": "opencompass/cmmlu", - "hf_id": "opencompass/cmmlu", - "local": "./data/cmmlu/", - }, - # CommonsenseQA - "opencompass/commonsense_qa": { - "ms_id": "opencompass/commonsense_qa", - "hf_id": "opencompass/commonsense_qa", - "local": "./data/commonsenseqa", - }, - # CMRC - "opencompass/cmrc_dev": { - "ms_id": "opencompass/cmrc_dev", - "hf_id": 
"opencompass/cmrc_dev", - "local": "./data/CLUE/CMRC/dev.json", - }, - # DRCD_dev - "opencompass/drcd_dev": { - "ms_id": "opencompass/drcd_dev", - "hf_id": "opencompass/drcd_dev", - "local": "./data/CLUE/DRCD/dev.json", - }, - # clozeTest_maxmin - "opencompass/clozeTest_maxmin": { - "ms_id": None, - "hf_id": None, - "local": "./data/clozeTest-maxmin/python/clozeTest.json", - }, - # clozeTest_maxmin - "opencompass/clozeTest_maxmin_answers": { - "ms_id": None, - "hf_id": None, - "local": "./data/clozeTest-maxmin/python/answers.txt", - }, - # Flores - "opencompass/flores": { - "ms_id": "opencompass/flores", - "hf_id": "opencompass/flores", - "local": "./data/flores_first100", - }, - # MBPP - "opencompass/mbpp": { - "ms_id": "opencompass/mbpp", - "hf_id": "opencompass/mbpp", - "local": "./data/mbpp/mbpp.jsonl", - }, - # 'opencompass/mbpp': { - # 'ms_id': 'opencompass/mbpp', - # 'hf_id': 'opencompass/mbpp', - # 'local': './data/mbpp/mbpp.jsonl', - # }, - "opencompass/sanitized_mbpp": { - "ms_id": "opencompass/mbpp", - "hf_id": "opencompass/mbpp", - "local": "./data/mbpp/sanitized-mbpp.jsonl", - }, - # GSM - "opencompass/gsm8k": { - "ms_id": "opencompass/gsm8k", - "hf_id": "opencompass/gsm8k", - "local": "./data/gsm8k/", - }, - # HellaSwag - "opencompass/hellaswag": { - "ms_id": "opencompass/hellaswag", - "hf_id": "opencompass/hellaswag", - "local": "./data/hellaswag/hellaswag.jsonl", - }, - # HellaSwagICE - "opencompass/hellaswag_ice": { - "ms_id": "opencompass/hellaswag", - "hf_id": "opencompass/hellaswag", - "local": "./data/hellaswag/", - }, - # HumanEval - "opencompass/humaneval": { - "ms_id": "opencompass/humaneval", - "hf_id": "opencompass/humaneval", - "local": "./data/humaneval/human-eval-v2-20210705.jsonl", - }, - # HumanEvalCN - "opencompass/humaneval_cn": { - "ms_id": "opencompass/humaneval", - "hf_id": "opencompass/humaneval", - "local": "./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl", - }, - #KORBENCH - "opencompass/korbench": { - "ms_id": "", - "hf_id": "", - "local": "./data/korbench", - }, - # Lambada - "opencompass/lambada": { - "ms_id": "opencompass/lambada", - "hf_id": "opencompass/lambada", - "local": "./data/lambada/test.jsonl", - }, - # LCSTS - "opencompass/LCSTS": { - "ms_id": "opencompass/LCSTS", - "hf_id": "opencompass/LCSTS", - "local": "./data/LCSTS", - }, - # MATH - "opencompass/math": { - "ms_id": "opencompass/math", - "hf_id": "opencompass/math", - "local": "./data/math/", - }, - # MMLU - "opencompass/mmlu": { - "ms_id": "opencompass/mmlu", - "hf_id": "opencompass/mmlu", - "local": "./data/mmlu/", - }, - # MMLU_PRO - "opencompass/mmlu_pro": { - "ms_id": "", - "hf_id": "", - "local": "./data/mmlu_pro", - }, - # MultiPL-E - "opencompass/multipl_e": { - "ms_id": "", - "hf_id": "", - "local": "./data/multipl_e", - }, - # NQ - "opencompass/natural_question": { - "ms_id": "opencompass/natural_question", - "hf_id": "opencompass/natural_question", - "local": "./data/nq/", - }, - # OpenBook QA-test - "opencompass/openbookqa_test": { - "ms_id": "opencompass/openbookqa", - "hf_id": "opencompass/openbookqa", - "local": "./data/openbookqa/Main/test.jsonl", - }, - # OpenBook QA-fact - "opencompass/openbookqa_fact": { - "ms_id": "opencompass/openbookqa", - "hf_id": "opencompass/openbookqa", - "local": "./data/openbookqa/Additional/test_complete.jsonl", - }, - # PIQA - "opencompass/piqa": { - "ms_id": "opencompass/piqa", - "hf_id": "opencompass/piqa", - "local": "./data/piqa", - }, - # RACE - "opencompass/race": { - "ms_id": "opencompass/race", - "hf_id": "opencompass/race", 
- "local": "./data/race/", - }, - # SIQA - "opencompass/siqa": { - "ms_id": "opencompass/siqa", - "hf_id": "opencompass/siqa", - "local": "./data/siqa", - }, - # XStoryCloze - "opencompass/xstory_cloze": { - "ms_id": "opencompass/xstory_cloze", - "hf_id": "opencompass/xstory_cloze", - "local": "./data/xstory_cloze", - }, - # StrategyQA - "opencompass/strategy_qa": { - "ms_id": "opencompass/strategy_qa", - "hf_id": "opencompass/strategy_qa", - "local": "./data/strategyqa/strategyQA_train.json", - }, - # SummEdits - "opencompass/summedits": { - "ms_id": "opencompass/summedits", - "hf_id": "opencompass/summedits", - "local": "./data/summedits/summedits.jsonl", - }, - # SuperGLUE - "opencompass/boolq": { - "ms_id": "opencompass/boolq", - "hf_id": "opencompass/boolq", - "local": "./data/SuperGLUE/BoolQ/val.jsonl", - }, - # TriviaQA - "opencompass/trivia_qa": { - "ms_id": "opencompass/trivia_qa", - "hf_id": "opencompass/trivia_qa", - "local": "./data/triviaqa/", - }, - # TydiQA - "opencompass/tydiqa": { - "ms_id": "opencompass/tydiqa", - "hf_id": "opencompass/tydiqa", - "local": "./data/tydiqa/", - }, - # Winogrande - "opencompass/winogrande": { - "ms_id": "opencompass/winogrande", - "hf_id": "opencompass/winogrande", - "local": "./data/winogrande/", - }, - # XSum - "opencompass/xsum": { - "ms_id": "opencompass/xsum", - "hf_id": "opencompass/xsum", - "local": "./data/Xsum/dev.jsonl", - }, - # Longbench - "opencompass/Longbench": { - "ms_id": "", - "hf_id": "THUDM/LongBench", - "local": "./data/Longbench", - }, - # Needlebench - "opencompass/needlebench": { - "ms_id": "", - "hf_id": "opencompass/needlebench", - "local": "./data/needlebench", - }, - "opencompass/code_generation_lite": { - "ms_id": "", - "hf_id": "", - "local": "./data/code_generation_lite", - }, - "opencompass/execution-v2": { - "ms_id": "", - "hf_id": "", - "local": "./data/execution-v2", - }, - "opencompass/test_generation": { - "ms_id": "", - "hf_id": "", - "local": "./data/test_generation", - }, - "opencompass/aime2024": { - "ms_id": "", - "hf_id": "", - "local": "./data/aime.jsonl", - }, - "opencompass/aime2025": { - "ms_id": "", - "hf_id": "", - "local": "./data/aime2025/aime2025.jsonl", - }, - "opencompass/cmo_fib": { - "ms_id": "", - "hf_id": "", - "local": "./data/cmo.jsonl", - }, - "opencompass/nq_open": { - "ms_id": "", - "hf_id": "", - "local": "./data/nq-open/", - }, - "opencompass/GAOKAO-BENCH": { - "ms_id": "", - "hf_id": "", - "local": "./data/GAOKAO-BENCH/data", - }, - "opencompass/WikiBench": { - "ms_id": "", - "hf_id": "", - "local": "./data/WikiBench/", - }, - "opencompass/mmmlu_lite": { - "ms_id": "", - "hf_id": "", - "local": "./data/mmmlu_lite", - }, - "opencompass/mmmlu_lite": { - "ms_id": "", - "hf_id": "", - "local": "./data/mmmlu_lite", - }, - "opencompass/musr": { - "ms_id": "", - "hf_id": "", - "local": "./data/musr", - }, - "opencompass/babilong": { - "ms_id": "", - "hf_id": "", - "local": "./data/babilong/data/", - }, - "P-MMEval": { - "ms_id": "", - "hf_id": "", - "local": "./data/P-MMEval/", - }, - "opencompass/arc_prize_public_evaluation": { - "ms_id": "", - "hf_id": "", - "local": "./data/arc_prize_public_evaluation", - }, - "opencompass/simpleqa": { - "ms_id": "", - "hf_id": "", - "local": "./data/simpleqa/simple_qa_test_set.csv", - }, - "opencompass/chinese_simpleqa": { - "ms_id": "", - "hf_id": "", - "local": "./data/chinese_simpleqa", - }, - "opencompass/LiveMathBench202412": { - "ms_id": "", - "hf_id": "", - "local": "./data/LiveMathBench/", - }, - "opencompass/LiveMathBench": { - "ms_id": 
"", - "hf_id": "opencompass/LiveMathBench", - "local": "./data/LiveMathBench/", - }, - "opencompass/LiveReasonBench": { - "ms_id": "", - "hf_id": "", - "local": "./data/LiveReasonBench/", - }, - "opencompass/bigcodebench": { - "ms_id": "", - "hf_id": "", - "local": "./data/bigcodebench/", - }, - "opencompass/qabench": { - "ms_id": "", - "hf_id": "", - "local": "./data/qabench", - }, - "opencompass/livestembench": { - "ms_id": "", - "hf_id": "", - "local": "./data/livestembench/", - }, - "opencompass/longbenchv2": { - "ms_id": "", - "hf_id": "THUDM/LongBench-v2", - "local": "./data/longbenchv2/data.json", - }, - "opencompass/OlympiadBench": { - "ms_id": "", - "hf_id": "", - "local": "./data/OlympiadBench", - }, -} - -DATASETS_URL = { - "/OlympiadBench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", - "md5": "97e8b1ae7f6170d94817288a8930ef00", - }, - "/longbenchv2": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", - "md5": "09b7e06e6f98c5cca8ad597b3d7b42f0", - }, - "/livestembench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", - "md5": "0ff59d031c3dcff56a2e00e8c1489f5d", - }, - "/musr": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", - "md5": "7447d2a5bec4586035196102135e2af9", - }, - "/mmlu/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", - "md5": "761310671509a239e41c4b717f7fab9c", - }, - "/mmmlu_lite": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", - "md5": "a776af1220e1826fd0608eda1bc4425e", - }, - "/simpleqa": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", - "md5": "1d83fc2e15798d39cb265c9a3cb5195a", - }, - "/chinese_simpleqa": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", - "md5": "4bdf854b291fc0ee29da57dc47ac47b5", - }, - "/gpqa/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", - "md5": "2e9657959030a765916f1f2aca29140d", - }, - "/CHARM/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", - "md5": "fdf51e955d1b8e0bb35bc1997eaf37cb", - }, - "/ifeval/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", - "md5": "64d98b6f36b42e7390c9cef76cace75f", - }, - "/mbpp/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", - "md5": "777739c90f04bce44096a5bc96c8f9e5", - }, - "/cmmlu/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", - "md5": "a59f4003d6918509a719ce3bc2a5d5bc", - }, - "/math/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", - "md5": "cb5b4c8378085929e20345174e731fdf", - }, - "/hellaswag/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", - "md5": "2b700a02ffb58571c7df8d8d0619256f", - }, - "/BBH/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", - "md5": "60c49f9bef5148aa7e1941328e96a554", - }, - "/compass_arena/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", - "md5": "cd59b54a179d16f2a858b359b60588f6", - }, - "/TheoremQA/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", - "md5": "f2793b07bc26510d507aa710d9bd8622", - 
}, - "/mathbench_v1/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", - "md5": "50257a910ca43d1f61a610a79fdb16b5", - }, - "/gsm8k/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", - "md5": "901e5dc93a2889789a469da9850cdca8", - }, - "/LCBench2023/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", - "md5": "e1a38c94a42ad1809e9e0650476a9306", - }, - "/humaneval/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", - "md5": "88b1b89dc47b7121c81da6bcd85a69c3", - }, - "/humanevalx": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", - "md5": "22930355c03fb73fb5bae14b50f1deb9", - }, - "/ds1000_data": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", - "md5": "1a4990aec04a2fd73ccfad12e2d43b43", - }, - "/drop_simple_eval/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", - "md5": "c912afe5b4a63509851cf16e6b91830e", - }, - "subjective/alignment_bench/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", - "md5": "d8ae9a0398526479dbbcdb80fafabceb", - }, - "subjective/alpaca_eval": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", - "md5": "d7399d63cb46c82f089447160ef49b6a", - }, - "subjective/arena_hard": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", - "md5": "02cd09a482cb0f0cd9d2c2afe7a1697f", - }, - "subjective/mtbench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", - "md5": "d1afc0787aeac7f1f24872742e161069", - }, - "subjective/fofo": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", - "md5": "8a302712e425e27e4292a9369df5b9d3", - }, - "subjective/followbench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", - "md5": "da7a831817c969da15d1e78d4a245d8a", - }, - "subjective/mtbench101": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", - "md5": "5d80257bc9929ebe5cfbf6d11184b04c", - }, - "subjective/WildBench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", - "md5": "b06252857f1f8f44a17b1bfca4888ff4", - }, - "/ruler/": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", - "md5": "c60bdfff3d02358067104cc1dea7c0f7", - }, - "/scicode": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", - "md5": "9c6c64b8c70edc418f713419ea39989c", - }, - "/commonsenseqa": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", - "md5": "c4a82fc07c81ae1462605f5d7fd2bb2e", - }, - "FewCLUE": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", - "md5": "7976e2bb0e9d885ffd3c55f7c5d4021e", - }, - "/race": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", - "md5": "b758251764a264746cf45749c02363f9", - }, - "/ARC": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", - "md5": "d720629b69f1a51cfe78bf65b00b44f6", - }, - "/SuperGLUE": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", - "md5": 
"b60904915b0b61d1a04ea52280169936", - }, - "SQuAD2.0": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", - "md5": "1321cbf9349e1102a57d31d1b2bfdd7e", - }, - "mmlu_pro": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", - "md5": "e3200c7380f4cea5f13c768f2815fabb", - }, - "multipl_e": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/multipl_e.zip", - "md5": "24462aac7a38a4a62f5c5e89eb614e20", - }, - "/Longbench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", - "md5": "ab0cb9e520ae5cfb899bf38b564249bb", - }, - "/needlebench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", - "md5": "dad5c903ebfea16eaf186b8997aeedad", - }, - "/teval": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", - "md5": "7628ab5891a26bf96ca17becfd044867", - }, - "/code_generation_lite": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", - "md5": "ebcf8db56f5c817ca8202a542be30cb4", - }, - "/execution-v2": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", - "md5": "019ef1a0686ee6ca34f51c8af104fcd9", - }, - "/test_generation": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", - "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", - }, - "/aime2024": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", - "md5": "fbe2d0577fc210962a549f8cea1a00c8", - }, - "/aime2025": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime2025.zip", - "md5": "aa18cd5d2e2de246c5397f5eb1e61004", - }, - "/cmo": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", - "md5": "fad52c81290506a8ca74f46b5400d8fc", - }, - "/nq-open": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", - "md5": "a340521e5c9ec591227dcb367f718b25", - }, - "/winogrande": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", - "md5": "9e949a75eacc26ed4fd2b9aa870b495b", - }, - "/triviaqa": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", - "md5": "e6a118d744236814926b2ec7ec66c034", - }, - "/GAOKAO-BENCH": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", - "md5": "ba3c71b8b9db96d2a0664b977c4f9784", - }, - "/WikiBench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", - "md5": "6dac1d1a3133fe1effff185cbf71d928", - }, - "/babilong": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", - "md5": "e400864c31bc58d29eaa3e199751f99b", - }, - "/korbench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", - "md5": "9107597d137e7362eaf7d218ddef7a6d", - }, - "/bbeh": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip", - "md5": "43a3c2d73aee731ac68ac790bc9a358e", - }, - "subjective/judgerbench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", - "md5": "60d605883aa8cac9755819140ab42c6b" - }, - "/arc_prize_public_evaluation": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", - "md5": "367a33977651496efddba7670009807e" 
- }, - "P-MMEval": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", - "md5": "09e401e6229a50647b9e13c429e634d1", - }, - "LiveMathBench": { - 'url': - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", - "md5": "d0781f9185c9bb50e81e6e3ca8c59013", - }, - "bigcodebench": { - "url": - "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", - "md5": "270f399f4142b74f47ecff116cc3b21d" - } -} +DATASETS_MAPPING = { + # ADVGLUE Datasets + "opencompass/advglue-dev": { + "ms_id": None, + "hf_id": None, + "local": "./data/adv_glue/dev_ann.json", + }, + # AGIEval Datasets + "opencompass/agieval": { + "ms_id": "opencompass/agieval", + "hf_id": "opencompass/agieval", + "local": "./data/AGIEval/data/v1/", + }, + # ARC Datasets(Test) + "opencompass/ai2_arc-test": { + "ms_id": "opencompass/ai2_arc", + "hf_id": "opencompass/ai2_arc", + "local": "./data/ARC/ARC-c/ARC-Challenge-Test.jsonl", + }, + "opencompass/ai2_arc-dev": { + "ms_id": "opencompass/ai2_arc", + "hf_id": "opencompass/ai2_arc", + "local": "./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl", + }, + "opencompass/ai2_arc-easy-dev": { + "ms_id": "opencompass/ai2_arc", + "hf_id": "opencompass/ai2_arc", + "local": "./data/ARC/ARC-e/ARC-Easy-Dev.jsonl", + }, + # BBH + "opencompass/bbh": { + "ms_id": "opencompass/bbh", + "hf_id": "opencompass/bbh", + "local": "./data/BBH/data", + }, + # bbeh + "opencompass/bbeh": { + "ms_id": "", + "hf_id": "", + "local": "./data/bbeh/", + }, + # C-Eval + "opencompass/ceval-exam": { + "ms_id": "opencompass/ceval-exam", + "hf_id": "opencompass/ceval-exam", + "local": "./data/ceval/formal_ceval", + }, + # AFQMC + "opencompass/afqmc-dev": { + "ms_id": "opencompass/afqmc", + "hf_id": "opencompass/afqmc", + "local": "./data/CLUE/AFQMC/dev.json", + }, + # CMNLI + "opencompass/cmnli-dev": { + "ms_id": "opencompass/cmnli", + "hf_id": "opencompass/cmnli", + "local": "./data/CLUE/cmnli/cmnli_public/dev.json", + }, + # OCNLI + "opencompass/OCNLI-dev": { + "ms_id": "opencompass/OCNLI", + "hf_id": "opencompass/OCNLI", + "local": "./data/CLUE/OCNLI/dev.json", + }, + # ChemBench + "opencompass/ChemBench": { + "ms_id": "opencompass/ChemBench", + "hf_id": "opencompass/ChemBench", + "local": "./data/ChemBench/", + }, + # CMMLU + "opencompass/cmmlu": { + "ms_id": "opencompass/cmmlu", + "hf_id": "opencompass/cmmlu", + "local": "./data/cmmlu/", + }, + # CommonsenseQA + "opencompass/commonsense_qa": { + "ms_id": "opencompass/commonsense_qa", + "hf_id": "opencompass/commonsense_qa", + "local": "./data/commonsenseqa", + }, + # CMRC + "opencompass/cmrc_dev": { + "ms_id": "opencompass/cmrc_dev", + "hf_id": "opencompass/cmrc_dev", + "local": "./data/CLUE/CMRC/dev.json", + }, + # DRCD_dev + "opencompass/drcd_dev": { + "ms_id": "opencompass/drcd_dev", + "hf_id": "opencompass/drcd_dev", + "local": "./data/CLUE/DRCD/dev.json", + }, + # clozeTest_maxmin + "opencompass/clozeTest_maxmin": { + "ms_id": None, + "hf_id": None, + "local": "./data/clozeTest-maxmin/python/clozeTest.json", + }, + # clozeTest_maxmin + "opencompass/clozeTest_maxmin_answers": { + "ms_id": None, + "hf_id": None, + "local": "./data/clozeTest-maxmin/python/answers.txt", + }, + # Flores + "opencompass/flores": { + "ms_id": "opencompass/flores", + "hf_id": "opencompass/flores", + "local": "./data/flores_first100", + }, + # MBPP + "opencompass/mbpp": { + "ms_id": "opencompass/mbpp", + "hf_id": "opencompass/mbpp", + "local": "./data/mbpp/mbpp.jsonl", + }, + # 'opencompass/mbpp': { + # 'ms_id': 
'opencompass/mbpp', + # 'hf_id': 'opencompass/mbpp', + # 'local': './data/mbpp/mbpp.jsonl', + # }, + "opencompass/sanitized_mbpp": { + "ms_id": "opencompass/mbpp", + "hf_id": "opencompass/mbpp", + "local": "./data/mbpp/sanitized-mbpp.jsonl", + }, + # GSM + "opencompass/gsm8k": { + "ms_id": "opencompass/gsm8k", + "hf_id": "opencompass/gsm8k", + "local": "./data/gsm8k/", + }, + # HellaSwag + "opencompass/hellaswag": { + "ms_id": "opencompass/hellaswag", + "hf_id": "opencompass/hellaswag", + "local": "./data/hellaswag/hellaswag.jsonl", + }, + # HellaSwagICE + "opencompass/hellaswag_ice": { + "ms_id": "opencompass/hellaswag", + "hf_id": "opencompass/hellaswag", + "local": "./data/hellaswag/", + }, + # HumanEval + "opencompass/humaneval": { + "ms_id": "opencompass/humaneval", + "hf_id": "opencompass/humaneval", + "local": "./data/humaneval/human-eval-v2-20210705.jsonl", + }, + # HumanEvalCN + "opencompass/humaneval_cn": { + "ms_id": "opencompass/humaneval", + "hf_id": "opencompass/humaneval", + "local": "./data/humaneval_cn/human-eval-cn-v2-20210705.jsonl", + }, + #KORBENCH + "opencompass/korbench": { + "ms_id": "", + "hf_id": "", + "local": "./data/korbench", + }, + # Lambada + "opencompass/lambada": { + "ms_id": "opencompass/lambada", + "hf_id": "opencompass/lambada", + "local": "./data/lambada/test.jsonl", + }, + # LCSTS + "opencompass/LCSTS": { + "ms_id": "opencompass/LCSTS", + "hf_id": "opencompass/LCSTS", + "local": "./data/LCSTS", + }, + # MATH + "opencompass/math": { + "ms_id": "opencompass/math", + "hf_id": "opencompass/math", + "local": "./data/math/", + }, + # MMLU + "opencompass/mmlu": { + "ms_id": "opencompass/mmlu", + "hf_id": "opencompass/mmlu", + "local": "./data/mmlu/", + }, + # MMLU_PRO + "opencompass/mmlu_pro": { + "ms_id": "", + "hf_id": "", + "local": "./data/mmlu_pro", + }, + # MultiPL-E + "opencompass/multipl_e": { + "ms_id": "", + "hf_id": "", + "local": "./data/multipl_e", + }, + # NQ + "opencompass/natural_question": { + "ms_id": "opencompass/natural_question", + "hf_id": "opencompass/natural_question", + "local": "./data/nq/", + }, + # OpenBook QA-test + "opencompass/openbookqa_test": { + "ms_id": "opencompass/openbookqa", + "hf_id": "opencompass/openbookqa", + "local": "./data/openbookqa/Main/test.jsonl", + }, + # OpenBook QA-fact + "opencompass/openbookqa_fact": { + "ms_id": "opencompass/openbookqa", + "hf_id": "opencompass/openbookqa", + "local": "./data/openbookqa/Additional/test_complete.jsonl", + }, + # PIQA + "opencompass/piqa": { + "ms_id": "opencompass/piqa", + "hf_id": "opencompass/piqa", + "local": "./data/piqa", + }, + # RACE + "opencompass/race": { + "ms_id": "opencompass/race", + "hf_id": "opencompass/race", + "local": "./data/race/", + }, + # SIQA + "opencompass/siqa": { + "ms_id": "opencompass/siqa", + "hf_id": "opencompass/siqa", + "local": "./data/siqa", + }, + # XStoryCloze + "opencompass/xstory_cloze": { + "ms_id": "opencompass/xstory_cloze", + "hf_id": "opencompass/xstory_cloze", + "local": "./data/xstory_cloze", + }, + # StrategyQA + "opencompass/strategy_qa": { + "ms_id": "opencompass/strategy_qa", + "hf_id": "opencompass/strategy_qa", + "local": "./data/strategyqa/strategyQA_train.json", + }, + # SummEdits + "opencompass/summedits": { + "ms_id": "opencompass/summedits", + "hf_id": "opencompass/summedits", + "local": "./data/summedits/summedits.jsonl", + }, + # SuperGLUE + "opencompass/boolq": { + "ms_id": "opencompass/boolq", + "hf_id": "opencompass/boolq", + "local": "./data/SuperGLUE/BoolQ/val.jsonl", + }, + # TriviaQA + 
"opencompass/trivia_qa": { + "ms_id": "opencompass/trivia_qa", + "hf_id": "opencompass/trivia_qa", + "local": "./data/triviaqa/", + }, + # TydiQA + "opencompass/tydiqa": { + "ms_id": "opencompass/tydiqa", + "hf_id": "opencompass/tydiqa", + "local": "./data/tydiqa/", + }, + # Winogrande + "opencompass/winogrande": { + "ms_id": "opencompass/winogrande", + "hf_id": "opencompass/winogrande", + "local": "./data/winogrande/", + }, + # XSum + "opencompass/xsum": { + "ms_id": "opencompass/xsum", + "hf_id": "opencompass/xsum", + "local": "./data/Xsum/dev.jsonl", + }, + # Longbench + "opencompass/Longbench": { + "ms_id": "", + "hf_id": "THUDM/LongBench", + "local": "./data/Longbench", + }, + # Needlebench + "opencompass/needlebench": { + "ms_id": "", + "hf_id": "opencompass/needlebench", + "local": "./data/needlebench", + }, + "opencompass/code_generation_lite": { + "ms_id": "", + "hf_id": "", + "local": "./data/code_generation_lite", + }, + "opencompass/execution-v2": { + "ms_id": "", + "hf_id": "", + "local": "./data/execution-v2", + }, + "opencompass/test_generation": { + "ms_id": "", + "hf_id": "", + "local": "./data/test_generation", + }, + "opencompass/aime2024": { + "ms_id": "", + "hf_id": "", + "local": "./data/aime.jsonl", + }, + "opencompass/aime2025": { + "ms_id": "", + "hf_id": "", + "local": "./data/aime2025/aime2025.jsonl", + }, + "opencompass/cmo_fib": { + "ms_id": "", + "hf_id": "", + "local": "./data/cmo.jsonl", + }, + "opencompass/nq_open": { + "ms_id": "", + "hf_id": "", + "local": "./data/nq-open/", + }, + "opencompass/GAOKAO-BENCH": { + "ms_id": "", + "hf_id": "", + "local": "./data/GAOKAO-BENCH/data", + }, + "opencompass/WikiBench": { + "ms_id": "", + "hf_id": "", + "local": "./data/WikiBench/", + }, + "opencompass/mmmlu_lite": { + "ms_id": "", + "hf_id": "", + "local": "./data/mmmlu_lite", + }, + "opencompass/mmmlu_lite": { + "ms_id": "", + "hf_id": "", + "local": "./data/mmmlu_lite", + }, + "opencompass/musr": { + "ms_id": "", + "hf_id": "", + "local": "./data/musr", + }, + "opencompass/babilong": { + "ms_id": "", + "hf_id": "", + "local": "./data/babilong/data/", + }, + "P-MMEval": { + "ms_id": "", + "hf_id": "", + "local": "./data/P-MMEval/", + }, + "opencompass/arc_prize_public_evaluation": { + "ms_id": "", + "hf_id": "", + "local": "./data/arc_prize_public_evaluation", + }, + "opencompass/simpleqa": { + "ms_id": "", + "hf_id": "", + "local": "./data/simpleqa/simple_qa_test_set.csv", + }, + "opencompass/chinese_simpleqa": { + "ms_id": "", + "hf_id": "", + "local": "./data/chinese_simpleqa", + }, + "opencompass/LiveMathBench202412": { + "ms_id": "", + "hf_id": "", + "local": "./data/LiveMathBench/", + }, + "opencompass/LiveMathBench": { + "ms_id": "", + "hf_id": "opencompass/LiveMathBench", + "local": "./data/LiveMathBench/", + }, + "opencompass/LiveReasonBench": { + "ms_id": "", + "hf_id": "", + "local": "./data/LiveReasonBench/", + }, + "opencompass/bigcodebench": { + "ms_id": "", + "hf_id": "", + "local": "./data/bigcodebench/", + }, + "opencompass/qabench": { + "ms_id": "", + "hf_id": "", + "local": "./data/qabench", + }, + "opencompass/livestembench": { + "ms_id": "", + "hf_id": "", + "local": "./data/livestembench/", + }, + "opencompass/longbenchv2": { + "ms_id": "", + "hf_id": "THUDM/LongBench-v2", + "local": "./data/longbenchv2/data.json", + }, + "opencompass/OlympiadBench": { + "ms_id": "", + "hf_id": "", + "local": "./data/OlympiadBench", + }, + "opencompass/maritimebench": { + "ms_id": "HiDolphin/MaritimeBench", + "hf_id": "", + "local": "./data/maritimebench", + 
}, +} + +DATASETS_URL = { + "/OlympiadBench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", + "md5": "97e8b1ae7f6170d94817288a8930ef00", + }, + "/longbenchv2": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", + "md5": "09b7e06e6f98c5cca8ad597b3d7b42f0", + }, + "/livestembench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", + "md5": "0ff59d031c3dcff56a2e00e8c1489f5d", + }, + "/musr": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", + "md5": "7447d2a5bec4586035196102135e2af9", + }, + "/mmlu/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", + "md5": "761310671509a239e41c4b717f7fab9c", + }, + "/mmmlu_lite": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", + "md5": "a776af1220e1826fd0608eda1bc4425e", + }, + "/simpleqa": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", + "md5": "1d83fc2e15798d39cb265c9a3cb5195a", + }, + "/chinese_simpleqa": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", + "md5": "4bdf854b291fc0ee29da57dc47ac47b5", + }, + "/gpqa/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", + "md5": "2e9657959030a765916f1f2aca29140d", + }, + "/CHARM/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", + "md5": "fdf51e955d1b8e0bb35bc1997eaf37cb", + }, + "/ifeval/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", + "md5": "64d98b6f36b42e7390c9cef76cace75f", + }, + "/mbpp/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", + "md5": "777739c90f04bce44096a5bc96c8f9e5", + }, + "/cmmlu/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", + "md5": "a59f4003d6918509a719ce3bc2a5d5bc", + }, + "/math/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", + "md5": "cb5b4c8378085929e20345174e731fdf", + }, + "/hellaswag/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", + "md5": "2b700a02ffb58571c7df8d8d0619256f", + }, + "/BBH/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", + "md5": "60c49f9bef5148aa7e1941328e96a554", + }, + "/compass_arena/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", + "md5": "cd59b54a179d16f2a858b359b60588f6", + }, + "/TheoremQA/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", + "md5": "f2793b07bc26510d507aa710d9bd8622", + }, + "/mathbench_v1/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", + "md5": "50257a910ca43d1f61a610a79fdb16b5", + }, + "/gsm8k/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", + "md5": "901e5dc93a2889789a469da9850cdca8", + }, + "/LCBench2023/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", + "md5": "e1a38c94a42ad1809e9e0650476a9306", + }, + "/humaneval/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", + "md5": "88b1b89dc47b7121c81da6bcd85a69c3", + }, + "/humanevalx": { + "url": + 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", + "md5": "22930355c03fb73fb5bae14b50f1deb9", + }, + "/ds1000_data": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", + "md5": "1a4990aec04a2fd73ccfad12e2d43b43", + }, + "/drop_simple_eval/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", + "md5": "c912afe5b4a63509851cf16e6b91830e", + }, + "subjective/alignment_bench/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", + "md5": "d8ae9a0398526479dbbcdb80fafabceb", + }, + "subjective/alpaca_eval": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", + "md5": "d7399d63cb46c82f089447160ef49b6a", + }, + "subjective/arena_hard": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", + "md5": "02cd09a482cb0f0cd9d2c2afe7a1697f", + }, + "subjective/mtbench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", + "md5": "d1afc0787aeac7f1f24872742e161069", + }, + "subjective/fofo": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", + "md5": "8a302712e425e27e4292a9369df5b9d3", + }, + "subjective/followbench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", + "md5": "da7a831817c969da15d1e78d4a245d8a", + }, + "subjective/mtbench101": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", + "md5": "5d80257bc9929ebe5cfbf6d11184b04c", + }, + "subjective/WildBench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", + "md5": "b06252857f1f8f44a17b1bfca4888ff4", + }, + "/ruler/": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", + "md5": "c60bdfff3d02358067104cc1dea7c0f7", + }, + "/scicode": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", + "md5": "9c6c64b8c70edc418f713419ea39989c", + }, + "/commonsenseqa": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", + "md5": "c4a82fc07c81ae1462605f5d7fd2bb2e", + }, + "FewCLUE": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", + "md5": "7976e2bb0e9d885ffd3c55f7c5d4021e", + }, + "/race": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", + "md5": "b758251764a264746cf45749c02363f9", + }, + "/ARC": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", + "md5": "d720629b69f1a51cfe78bf65b00b44f6", + }, + "/SuperGLUE": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", + "md5": "b60904915b0b61d1a04ea52280169936", + }, + "SQuAD2.0": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", + "md5": "1321cbf9349e1102a57d31d1b2bfdd7e", + }, + "mmlu_pro": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", + "md5": "e3200c7380f4cea5f13c768f2815fabb", + }, + "multipl_e": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/multipl_e.zip", + "md5": "24462aac7a38a4a62f5c5e89eb614e20", + }, + "/Longbench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", + "md5": "ab0cb9e520ae5cfb899bf38b564249bb", + }, + "/needlebench": { + "url": + 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", + "md5": "dad5c903ebfea16eaf186b8997aeedad", + }, + "/teval": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", + "md5": "7628ab5891a26bf96ca17becfd044867", + }, + "/code_generation_lite": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", + "md5": "ebcf8db56f5c817ca8202a542be30cb4", + }, + "/execution-v2": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", + "md5": "019ef1a0686ee6ca34f51c8af104fcd9", + }, + "/test_generation": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", + "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", + }, + "/aime2024": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", + "md5": "fbe2d0577fc210962a549f8cea1a00c8", + }, + "/aime2025": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime2025.zip", + "md5": "aa18cd5d2e2de246c5397f5eb1e61004", + }, + "/cmo": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", + "md5": "fad52c81290506a8ca74f46b5400d8fc", + }, + "/nq-open": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", + "md5": "a340521e5c9ec591227dcb367f718b25", + }, + "/winogrande": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", + "md5": "9e949a75eacc26ed4fd2b9aa870b495b", + }, + "/triviaqa": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", + "md5": "e6a118d744236814926b2ec7ec66c034", + }, + "/GAOKAO-BENCH": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", + "md5": "ba3c71b8b9db96d2a0664b977c4f9784", + }, + "/WikiBench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", + "md5": "6dac1d1a3133fe1effff185cbf71d928", + }, + "/babilong": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", + "md5": "e400864c31bc58d29eaa3e199751f99b", + }, + "/korbench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", + "md5": "9107597d137e7362eaf7d218ddef7a6d", + }, + "/bbeh": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip", + "md5": "43a3c2d73aee731ac68ac790bc9a358e", + }, + "subjective/judgerbench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", + "md5": "60d605883aa8cac9755819140ab42c6b" + }, + "/arc_prize_public_evaluation": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", + "md5": "367a33977651496efddba7670009807e" + }, + "P-MMEval": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", + "md5": "09e401e6229a50647b9e13c429e634d1", + }, + "LiveMathBench": { + 'url': + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", + "md5": "d0781f9185c9bb50e81e6e3ca8c59013", + }, + "bigcodebench": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", + "md5": "270f399f4142b74f47ecff116cc3b21d" + } +} diff --git a/tests/dataset/test_humaneval.py b/tests/dataset/test_humaneval.py index 0d1c23a1..175bfe22 100644 --- a/tests/dataset/test_humaneval.py +++ b/tests/dataset/test_humaneval.py @@ -1,110 +1,110 @@ 
-import unittest - -from opencompass.datasets.humaneval import humaneval_postprocess - - -def run_humaneval_check(completion): - program = [ - 'def get_fraction(x: float) -> float:', - humaneval_postprocess(completion), - '', - 'assert get_fraction(1.28) == 0.28', - 'assert get_fraction(1.0) == 0.0', - ] - program = '\n'.join(program) - exec(program) - - -class TestHumaneval(unittest.TestCase): - - def test_vanilla(self): - raw = ' return x - int(x)' - run_humaneval_check(raw) - - def test_python_quote(self): - lines = [ - '```python', - ' return x - int(x)', - '```', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_bare_quote(self): - lines = [ - '```', - ' return x - int(x)', - '```', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_error_space_quote(self): - lines = [ - '```', - ' return x - int(x)', - '```', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_import_1(self): - lines = [ - 'import numpy as np', - 'import math', - 'from typing import List', - '', - 'def func(x):', - ' return x - int(x)', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_import_2(self): - lines = [ - 'from typing import List', - 'import numpy as np', - 'import math', - 'def func(x):', - ' return x - int(x)', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_import_3(self): - lines = [ - 'import math', - '', - '', - 'def func(x):', - ' return x - int(x)', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_comment(self): - lines = [ - 'def func(x: float) -> float:', - " '''", - ' blah blah blah', - ' blah blah blah', - " '''", - ' return x - int(x)', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) - - def test_additional(self): - lines = [ - ' return x - int(x)', - '', - '', - 'def func(x: float) -> float:', - " '''", - ' blah blah blah', - ' blah blah blah', - " '''", - ' return x - int(x)', - ] - raw = '\n'.join(lines) - run_humaneval_check(raw) +import unittest + +from opencompass.datasets.humaneval import humaneval_postprocess + + +def run_humaneval_check(completion): + program = [ + 'def get_fraction(x: float) -> float:', + humaneval_postprocess(completion), + '', + 'assert get_fraction(1.28) == 0.28', + 'assert get_fraction(1.0) == 0.0', + ] + program = '\n'.join(program) + exec(program) + + +class TestHumaneval(unittest.TestCase): + + def test_vanilla(self): + raw = ' return x - int(x)' + run_humaneval_check(raw) + + def test_python_quote(self): + lines = [ + '```python', + ' return x - int(x)', + '```', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_bare_quote(self): + lines = [ + '```', + ' return x - int(x)', + '```', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_error_space_quote(self): + lines = [ + '```', + ' return x - int(x)', + '```', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_import_1(self): + lines = [ + 'import numpy as np', + 'import math', + 'from typing import List', + '', + 'def func(x):', + ' return x - int(x)', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_import_2(self): + lines = [ + 'from typing import List', + 'import numpy as np', + 'import math', + 'def func(x):', + ' return x - int(x)', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_import_3(self): + lines = [ + 'import math', + '', + '', + 'def func(x):', + ' return x - int(x)', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_comment(self): + lines = [ + 'def func(x: 
float) -> float:', + " '''", + ' blah blah blah', + ' blah blah blah', + " '''", + ' return x - int(x)', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) + + def test_additional(self): + lines = [ + ' return x - int(x)', + '', + '', + 'def func(x: float) -> float:', + " '''", + ' blah blah blah', + ' blah blah blah', + " '''", + ' return x - int(x)', + ] + raw = '\n'.join(lines) + run_humaneval_check(raw) diff --git a/tests/dataset/test_local_datasets.py b/tests/dataset/test_local_datasets.py index 1178749f..343b6812 100644 --- a/tests/dataset/test_local_datasets.py +++ b/tests/dataset/test_local_datasets.py @@ -1,230 +1,230 @@ -import random -import sys -import unittest -import warnings -from os import environ - -from datasets import Dataset, DatasetDict -from mmengine.config import read_base -from tqdm import tqdm - -from concurrent.futures import ThreadPoolExecutor, as_completed - -warnings.filterwarnings('ignore', category=DeprecationWarning) - - -def reload_datasets(): - modules_to_remove = [ - module_name for module_name in sys.modules - if module_name.startswith('configs.datasets') - ] - - for module_name in modules_to_remove: - del sys.modules[module_name] - - with read_base(): - from configs.datasets.ceval.ceval_gen import ceval_datasets - from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets - from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets - from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets - from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets - from configs.datasets.humaneval.humaneval_gen import humaneval_datasets - from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets - from configs.datasets.race.race_ppl import race_datasets - from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets - - from configs.datasets.mmlu.mmlu_gen import mmlu_datasets - from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets - from configs.datasets.bbh.bbh_gen import bbh_datasets - from configs.datasets.Xsum.Xsum_gen import Xsum_datasets - from configs.datasets.winogrande.winogrande_gen import winogrande_datasets - from configs.datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets - from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets - from configs.datasets.obqa.obqa_gen import obqa_datasets - from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets - from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets - # from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets - from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets - from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets - from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets - from configs.datasets.storycloze.storycloze_gen import storycloze_datasets - from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets - from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets - - from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets - from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets - from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 
import hellaswag_datasets as hellaswag_v1_datasets - from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets - from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets - from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets - from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets - from configs.datasets.nq.nq_gen import nq_datasets - from configs.datasets.lcsts.lcsts_gen import lcsts_datasets - from configs.datasets.math.math_gen import math_datasets - from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets - from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets - from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets - from configs.datasets.lambada.lambada_gen import lambada_datasets - from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets - from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets - from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets - from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets - from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets - from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets - - from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets - from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets - from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets - - from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets - from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets - from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets - from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets - from configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen import ocnli_fc_datasets - - return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -def load_datasets_conf(source): - environ['DATASET_SOURCE'] = source - datasets_conf = reload_datasets() - return datasets_conf - - -def load_datasets(source, conf): - environ['DATASET_SOURCE'] = source - if 'lang' in conf: - dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) - return dataset - if 'setting_name' in conf: - dataset = conf['type'].load(path=conf['path'], - name=conf['name'], - setting_name=conf['setting_name']) - return dataset - if 'name' in conf: - dataset = conf['type'].load(path=conf['path'], name=conf['name']) - return dataset - - if 'local_mode' in conf: - dataset = conf['type'].load(path=conf['path'], local_mode=conf['local_mode']) - return dataset - try: - dataset = conf['type'].load(path=conf['path']) - except Exception: - dataset = conf['type'].load(**conf) - return dataset - - -def clean_string(value): - """Helper function to clean and normalize string data. - - It strips leading and trailing whitespace and replaces multiple whitespace - characters with a single space. 
- """ - if isinstance(value, str): - return ' '.join(value.split()) - return value - - -class TestingLocalDatasets(unittest.TestCase): - - def test_datasets(self): - # 加载 ModelScope 和 Local 数据集配置 - # ms_datasets_conf = load_datasets_conf('ModelScope') - local_datasets_conf = load_datasets_conf('Local') - - # 初始化成功和失败的数据集列表 - successful_comparisons = [] - failed_comparisons = [] - - def compare_datasets(local_conf): - # local_dataset = load_datasets(local_conf) - local_dataset = load_datasets('Local', local_conf) - # modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" - local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" - # # 断言类型一致 - # assert ms_conf['type'] == local_conf['type'], "Data types do not match" - # print(modelscope_path_name, local_path_name) - try: - # ms_dataset = load_datasets('ModelScope', ms_conf) - local_dataset = load_datasets('Local', local_conf) - # _check_data(ms_dataset, local_dataset, sample_size=sample_size) - return 'success', f'{local_path_name}' - except Exception as exception: - # print(exception) - return 'failure', f'can\'t load {local_path_name}' - - with ThreadPoolExecutor(16) as executor: - futures = { - executor.submit(compare_datasets, local_conf): local_conf - for local_conf in local_datasets_conf - } - - for future in tqdm(as_completed(futures), total=len(futures)): - result, message = future.result() - if result == 'success': - successful_comparisons.append(message) - else: - failed_comparisons.append(message) - - # 输出测试总结 - total_datasets = len(local_datasets_conf) - print(f"All {total_datasets} datasets") - print(f"OK {len(successful_comparisons)} datasets") - for success in successful_comparisons: - print(f" {success}") - print(f"Fail {len(failed_comparisons)} datasets") - for failure in failed_comparisons: - print(f" {failure}") - - -def _check_data(ms_dataset: Dataset | DatasetDict, - oc_dataset: Dataset | DatasetDict, - sample_size): - assert type(ms_dataset) == type( - oc_dataset - ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' - - # match DatasetDict - if isinstance(oc_dataset, DatasetDict): - assert ms_dataset.keys() == oc_dataset.keys( - ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' - - for key in ms_dataset.keys(): - _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) - - elif isinstance(oc_dataset, Dataset): - # match by cols - assert set(ms_dataset.column_names) == set( - oc_dataset.column_names - ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' - - # Check that the number of rows is the same - assert len(ms_dataset) == len( - oc_dataset - ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' - - # Randomly sample indices - sample_indices = random.sample(range(len(ms_dataset)), - min(sample_size, len(ms_dataset))) - - for i, idx in enumerate(sample_indices): - for col in ms_dataset.column_names: - ms_value = clean_string(str(ms_dataset[col][idx])) - oc_value = clean_string(str(oc_dataset[col][idx])) - try: - assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" - except AssertionError as e: - print(f"Assertion failed for column '{col}', index {idx}") - print(f"ms_data: {ms_dataset[idx]}") - print(f'oc_data: {oc_dataset[idx]}') - print(f'ms_value: {ms_value} ({type(ms_value)})') - print(f'oc_value: {oc_value} ({type(oc_value)})') - raise e - else: - raise 
ValueError(f'Datasets type not supported {type(ms_dataset)}') - - -if __name__ == '__main__': - sample_size = 100 - unittest.main() +import random +import sys +import unittest +import warnings +from os import environ + +from datasets import Dataset, DatasetDict +from mmengine.config import read_base +from tqdm import tqdm + +from concurrent.futures import ThreadPoolExecutor, as_completed + +warnings.filterwarnings('ignore', category=DeprecationWarning) + + +def reload_datasets(): + modules_to_remove = [ + module_name for module_name in sys.modules + if module_name.startswith('configs.datasets') + ] + + for module_name in modules_to_remove: + del sys.modules[module_name] + + with read_base(): + from configs.datasets.ceval.ceval_gen import ceval_datasets + from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets + from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets + from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets + from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets + from configs.datasets.humaneval.humaneval_gen import humaneval_datasets + from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets + from configs.datasets.race.race_ppl import race_datasets + from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets + + from configs.datasets.mmlu.mmlu_gen import mmlu_datasets + from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets + from configs.datasets.bbh.bbh_gen import bbh_datasets + from configs.datasets.Xsum.Xsum_gen import Xsum_datasets + from configs.datasets.winogrande.winogrande_gen import winogrande_datasets + from configs.datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets + from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets + from configs.datasets.obqa.obqa_gen import obqa_datasets + from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets + from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets + # from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets + from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets + from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets + from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets + from configs.datasets.storycloze.storycloze_gen import storycloze_datasets + from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets + from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets + + from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets + from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets + from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets + from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets + from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets + from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets + from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets + from configs.datasets.nq.nq_gen import nq_datasets + from configs.datasets.lcsts.lcsts_gen 
import lcsts_datasets + from configs.datasets.math.math_gen import math_datasets + from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets + from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets + from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets + from configs.datasets.lambada.lambada_gen import lambada_datasets + from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets + from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets + from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets + from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets + from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets + from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets + + from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets + from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets + from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets + + from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets + from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets + from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets + from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets + from configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen import ocnli_fc_datasets + + return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +def load_datasets_conf(source): + environ['DATASET_SOURCE'] = source + datasets_conf = reload_datasets() + return datasets_conf + + +def load_datasets(source, conf): + environ['DATASET_SOURCE'] = source + if 'lang' in conf: + dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) + return dataset + if 'setting_name' in conf: + dataset = conf['type'].load(path=conf['path'], + name=conf['name'], + setting_name=conf['setting_name']) + return dataset + if 'name' in conf: + dataset = conf['type'].load(path=conf['path'], name=conf['name']) + return dataset + + if 'local_mode' in conf: + dataset = conf['type'].load(path=conf['path'], local_mode=conf['local_mode']) + return dataset + try: + dataset = conf['type'].load(path=conf['path']) + except Exception: + dataset = conf['type'].load(**conf) + return dataset + + +def clean_string(value): + """Helper function to clean and normalize string data. + + It strips leading and trailing whitespace and replaces multiple whitespace + characters with a single space. 
+ """ + if isinstance(value, str): + return ' '.join(value.split()) + return value + + +class TestingLocalDatasets(unittest.TestCase): + + def test_datasets(self): + # 加载 ModelScope 和 Local 数据集配置 + # ms_datasets_conf = load_datasets_conf('ModelScope') + local_datasets_conf = load_datasets_conf('Local') + + # 初始化成功和失败的数据集列表 + successful_comparisons = [] + failed_comparisons = [] + + def compare_datasets(local_conf): + # local_dataset = load_datasets(local_conf) + local_dataset = load_datasets('Local', local_conf) + # modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" + local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" + # # 断言类型一致 + # assert ms_conf['type'] == local_conf['type'], "Data types do not match" + # print(modelscope_path_name, local_path_name) + try: + # ms_dataset = load_datasets('ModelScope', ms_conf) + local_dataset = load_datasets('Local', local_conf) + # _check_data(ms_dataset, local_dataset, sample_size=sample_size) + return 'success', f'{local_path_name}' + except Exception as exception: + # print(exception) + return 'failure', f'can\'t load {local_path_name}' + + with ThreadPoolExecutor(16) as executor: + futures = { + executor.submit(compare_datasets, local_conf): local_conf + for local_conf in local_datasets_conf + } + + for future in tqdm(as_completed(futures), total=len(futures)): + result, message = future.result() + if result == 'success': + successful_comparisons.append(message) + else: + failed_comparisons.append(message) + + # 输出测试总结 + total_datasets = len(local_datasets_conf) + print(f"All {total_datasets} datasets") + print(f"OK {len(successful_comparisons)} datasets") + for success in successful_comparisons: + print(f" {success}") + print(f"Fail {len(failed_comparisons)} datasets") + for failure in failed_comparisons: + print(f" {failure}") + + +def _check_data(ms_dataset: Dataset | DatasetDict, + oc_dataset: Dataset | DatasetDict, + sample_size): + assert type(ms_dataset) == type( + oc_dataset + ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' + + # match DatasetDict + if isinstance(oc_dataset, DatasetDict): + assert ms_dataset.keys() == oc_dataset.keys( + ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' + + for key in ms_dataset.keys(): + _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) + + elif isinstance(oc_dataset, Dataset): + # match by cols + assert set(ms_dataset.column_names) == set( + oc_dataset.column_names + ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' + + # Check that the number of rows is the same + assert len(ms_dataset) == len( + oc_dataset + ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' + + # Randomly sample indices + sample_indices = random.sample(range(len(ms_dataset)), + min(sample_size, len(ms_dataset))) + + for i, idx in enumerate(sample_indices): + for col in ms_dataset.column_names: + ms_value = clean_string(str(ms_dataset[col][idx])) + oc_value = clean_string(str(oc_dataset[col][idx])) + try: + assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" + except AssertionError as e: + print(f"Assertion failed for column '{col}', index {idx}") + print(f"ms_data: {ms_dataset[idx]}") + print(f'oc_data: {oc_dataset[idx]}') + print(f'ms_value: {ms_value} ({type(ms_value)})') + print(f'oc_value: {oc_value} ({type(oc_value)})') + raise e + else: + raise 
ValueError(f'Datasets type not supported {type(ms_dataset)}') + + +if __name__ == '__main__': + sample_size = 100 + unittest.main() diff --git a/tests/dataset/test_ms_datasets.py b/tests/dataset/test_ms_datasets.py index 2f2133c7..39f199f4 100644 --- a/tests/dataset/test_ms_datasets.py +++ b/tests/dataset/test_ms_datasets.py @@ -1,226 +1,226 @@ -import random -import sys -import unittest -import warnings -from os import environ - -from datasets import Dataset, DatasetDict -from mmengine.config import read_base -from tqdm import tqdm - -from concurrent.futures import ThreadPoolExecutor, as_completed - -warnings.filterwarnings('ignore', category=DeprecationWarning) - - -def reload_datasets(): - modules_to_remove = [ - module_name for module_name in sys.modules - if module_name.startswith('configs.datasets') - ] - - for module_name in modules_to_remove: - del sys.modules[module_name] - - with read_base(): - from configs.datasets.ceval.ceval_gen import ceval_datasets - from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets - from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets - from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets - from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets - from configs.datasets.humaneval.humaneval_gen import humaneval_datasets - from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets - from configs.datasets.race.race_ppl import race_datasets - from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets - - from configs.datasets.mmlu.mmlu_gen import mmlu_datasets - from configs.datasets.bbh.bbh_gen import bbh_datasets - from configs.datasets.Xsum.Xsum_gen import Xsum_datasets - from configs.datasets.winogrande.winogrande_gen import winogrande_datasets - from configs.datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets - from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets - from configs.datasets.obqa.obqa_gen import obqa_datasets - from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets - from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets - from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets - from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets - from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets - from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets - from configs.datasets.storycloze.storycloze_gen import storycloze_datasets - from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets - from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets - - from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets - from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets - from configs.datasets.lcsts.lcsts_gen import lcsts_datasets - - from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets - from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets - from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets - from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as 
hellaswag_v3_datasets - from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets - from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets - from configs.datasets.nq.nq_gen import nq_datasets - from configs.datasets.math.math_gen import math_datasets - from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets - from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets - from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets - from configs.datasets.lambada.lambada_gen import lambada_datasets - from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets - from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets - from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets - from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets - from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets - from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets - - from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets - from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets - from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets - - from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets - from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets - from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets - from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets - - return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - - -def load_datasets_conf(source): - environ['DATASET_SOURCE'] = source - datasets_conf = reload_datasets() - return datasets_conf - - -def load_datasets(source, conf): - environ['DATASET_SOURCE'] = source - if 'lang' in conf: - dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) - return dataset - if 'setting_name' in conf: - dataset = conf['type'].load(path=conf['path'], - name=conf['name'], - setting_name=conf['setting_name']) - return dataset - if 'name' in conf: - dataset = conf['type'].load(path=conf['path'], name=conf['name']) - return dataset - try: - dataset = conf['type'].load(path=conf['path']) - except Exception as e: - print(e) - dataset = conf['type'].load(**conf) - return dataset - - -def clean_string(value): - """Helper function to clean and normalize string data. - - It strips leading and trailing whitespace and replaces multiple whitespace - characters with a single space. 
- """ - if isinstance(value, str): - return ' '.join(value.split()) - return value - - -class TestingMsDatasets(unittest.TestCase): - - def test_datasets(self): - # 加载 ModelScope 和 Local 数据集配置 - ms_datasets_conf = load_datasets_conf('ModelScope') - local_datasets_conf = load_datasets_conf('Local') - - # 初始化成功和失败的数据集列表 - successful_comparisons = [] - failed_comparisons = [] - - def compare_datasets(ms_conf, local_conf): - modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" - local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" - # 断言类型一致 - assert ms_conf['type'] == local_conf['type'], "Data types do not match" - print(modelscope_path_name, local_path_name) - try: - ms_dataset = load_datasets('ModelScope', ms_conf) - local_dataset = load_datasets('Local', local_conf) - _check_data(ms_dataset, local_dataset, sample_size=sample_size) - return 'success', f'{modelscope_path_name} | {local_path_name}' - except Exception as exception: - print(exception) - return 'failure', f'{modelscope_path_name} is not the same as {local_path_name}' - - with ThreadPoolExecutor(thread) as executor: - futures = { - executor.submit(compare_datasets, ms_conf, local_conf): (ms_conf, local_conf) - for ms_conf, local_conf in zip(ms_datasets_conf, local_datasets_conf) - } - - for future in tqdm(as_completed(futures), total=len(futures)): - result, message = future.result() - if result == 'success': - successful_comparisons.append(message) - else: - failed_comparisons.append(message) - - # 输出测试总结 - total_datasets = len(ms_datasets_conf) - print(f"All {total_datasets} datasets") - print(f"OK {len(successful_comparisons)} datasets") - for success in successful_comparisons: - print(f" {success}") - print(f"Fail {len(failed_comparisons)} datasets") - for failure in failed_comparisons: - print(f" {failure}") - - -def _check_data(ms_dataset: Dataset | DatasetDict, - oc_dataset: Dataset | DatasetDict, - sample_size): - assert type(ms_dataset) == type( - oc_dataset - ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' - - # match DatasetDict - if isinstance(oc_dataset, DatasetDict): - assert ms_dataset.keys() == oc_dataset.keys( - ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' - - for key in ms_dataset.keys(): - _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) - - elif isinstance(oc_dataset, Dataset): - # match by cols - assert set(ms_dataset.column_names) == set( - oc_dataset.column_names - ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' - - # Check that the number of rows is the same - assert len(ms_dataset) == len( - oc_dataset - ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' - - # Randomly sample indices - sample_indices = random.sample(range(len(ms_dataset)), - min(sample_size, len(ms_dataset))) - - for i, idx in enumerate(sample_indices): - for col in ms_dataset.column_names: - ms_value = clean_string(str(ms_dataset[col][idx])) - oc_value = clean_string(str(oc_dataset[col][idx])) - try: - assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" - except AssertionError as e: - print(f"Assertion failed for column '{col}', index {idx}") - print(f"ms_data: {ms_dataset[idx]}") - print(f'oc_data: {oc_dataset[idx]}') - print(f'ms_value: {ms_value} ({type(ms_value)})') - print(f'oc_value: {oc_value} ({type(oc_value)})') - raise e - else: - raise 
ValueError(f'Datasets type not supported {type(ms_dataset)}') - - -if __name__ == '__main__': - sample_size = 100 - thread = 1 - unittest.main() +import random +import sys +import unittest +import warnings +from os import environ + +from datasets import Dataset, DatasetDict +from mmengine.config import read_base +from tqdm import tqdm + +from concurrent.futures import ThreadPoolExecutor, as_completed + +warnings.filterwarnings('ignore', category=DeprecationWarning) + + +def reload_datasets(): + modules_to_remove = [ + module_name for module_name in sys.modules + if module_name.startswith('configs.datasets') + ] + + for module_name in modules_to_remove: + del sys.modules[module_name] + + with read_base(): + from configs.datasets.ceval.ceval_gen import ceval_datasets + from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets + from configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets + from configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets + from configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets + from configs.datasets.humaneval.humaneval_gen import humaneval_datasets + from configs.datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets as humaneval_repeat10_datasets + from configs.datasets.race.race_ppl import race_datasets + from configs.datasets.commonsenseqa.commonsenseqa_gen import commonsenseqa_datasets + + from configs.datasets.mmlu.mmlu_gen import mmlu_datasets + from configs.datasets.bbh.bbh_gen import bbh_datasets + from configs.datasets.Xsum.Xsum_gen import Xsum_datasets + from configs.datasets.winogrande.winogrande_gen import winogrande_datasets + from configs.datasets.winogrande.winogrande_ll import winogrande_datasets as winogrande_ll_datasets + from configs.datasets.winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets as winogrande_5shot_ll_datasets + from configs.datasets.obqa.obqa_gen import obqa_datasets + from configs.datasets.obqa.obqa_ppl_6aac9e import obqa_datasets as obqa_ppl_datasets + from configs.datasets.agieval.agieval_gen import agieval_datasets as agieval_v2_datasets + from configs.datasets.agieval.agieval_gen_a0c741 import agieval_datasets as agieval_v1_datasets + from configs.datasets.siqa.siqa_gen import siqa_datasets as siqa_v2_datasets + from configs.datasets.siqa.siqa_gen_18632c import siqa_datasets as siqa_v3_datasets + from configs.datasets.siqa.siqa_ppl_42bc6e import siqa_datasets as siqa_ppl_datasets + from configs.datasets.storycloze.storycloze_gen import storycloze_datasets + from configs.datasets.storycloze.storycloze_ppl import storycloze_datasets as storycloze_ppl_datasets + from configs.datasets.summedits.summedits_gen import summedits_datasets as summedits_v2_datasets + + from configs.datasets.strategyqa.strategyqa_gen import strategyqa_datasets + from configs.datasets.mbpp.mbpp_gen import mbpp_datasets as mbpp_v1_datasets + from configs.datasets.lcsts.lcsts_gen import lcsts_datasets + + from configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets as hellaswag_v2_datasets + from configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets as hellaswag_ice_datasets + from configs.datasets.hellaswag.hellaswag_ppl_9dbb12 import hellaswag_datasets as hellaswag_v1_datasets + from configs.datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets as hellaswag_v3_datasets + from configs.datasets.mbpp.mbpp_passk_gen_830460 import mbpp_datasets as mbpp_v2_datasets + from configs.datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets + from 
configs.datasets.nq.nq_gen import nq_datasets + from configs.datasets.math.math_gen import math_datasets + from configs.datasets.piqa.piqa_gen import piqa_datasets as piqa_v2_datasets + from configs.datasets.piqa.piqa_ppl import piqa_datasets as piqa_v1_datasets + from configs.datasets.piqa.piqa_ppl_0cfff2 import piqa_datasets as piqa_v3_datasets + from configs.datasets.lambada.lambada_gen import lambada_datasets + from configs.datasets.tydiqa.tydiqa_gen import tydiqa_datasets + from configs.datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets + from configs.datasets.GaokaoBench.GaokaoBench_mixed import GaokaoBench_datasets as GaokaoBench_mixed_datasets + from configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets as GaokaoBench_no_subjective_datasets + from configs.datasets.triviaqa.triviaqa_gen import triviaqa_datasets + from configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets as triviaqa_wiki_1shot_datasets + + from configs.datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets + from configs.datasets.CLUE_cmnli.CLUE_cmnli_ppl import cmnli_datasets as cmnli_ppl_datasets + from configs.datasets.CLUE_ocnli.CLUE_ocnli_gen import ocnli_datasets + + from configs.datasets.ceval.ceval_clean_ppl import ceval_datasets as ceval_clean_datasets + from configs.datasets.ARC_c.ARC_c_clean_ppl import ARC_c_datasets as ARC_c_clean_datasets + from configs.datasets.mmlu.mmlu_clean_ppl import mmlu_datasets as mmlu_clean_datasets + from configs.datasets.hellaswag.hellaswag_clean_ppl import hellaswag_datasets as hellaswag_clean_datasets + + return sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + + +def load_datasets_conf(source): + environ['DATASET_SOURCE'] = source + datasets_conf = reload_datasets() + return datasets_conf + + +def load_datasets(source, conf): + environ['DATASET_SOURCE'] = source + if 'lang' in conf: + dataset = conf['type'].load(path=conf['path'], lang=conf['lang']) + return dataset + if 'setting_name' in conf: + dataset = conf['type'].load(path=conf['path'], + name=conf['name'], + setting_name=conf['setting_name']) + return dataset + if 'name' in conf: + dataset = conf['type'].load(path=conf['path'], name=conf['name']) + return dataset + try: + dataset = conf['type'].load(path=conf['path']) + except Exception as e: + print(e) + dataset = conf['type'].load(**conf) + return dataset + + +def clean_string(value): + """Helper function to clean and normalize string data. + + It strips leading and trailing whitespace and replaces multiple whitespace + characters with a single space. 
+ """ + if isinstance(value, str): + return ' '.join(value.split()) + return value + + +class TestingMsDatasets(unittest.TestCase): + + def test_datasets(self): + # 加载 ModelScope 和 Local 数据集配置 + ms_datasets_conf = load_datasets_conf('ModelScope') + local_datasets_conf = load_datasets_conf('Local') + + # 初始化成功和失败的数据集列表 + successful_comparisons = [] + failed_comparisons = [] + + def compare_datasets(ms_conf, local_conf): + modelscope_path_name = f"{ms_conf.get('path')}/{ms_conf.get('name', '')}\t{ms_conf.get('lang', '')}" + local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}" + # 断言类型一致 + assert ms_conf['type'] == local_conf['type'], "Data types do not match" + print(modelscope_path_name, local_path_name) + try: + ms_dataset = load_datasets('ModelScope', ms_conf) + local_dataset = load_datasets('Local', local_conf) + _check_data(ms_dataset, local_dataset, sample_size=sample_size) + return 'success', f'{modelscope_path_name} | {local_path_name}' + except Exception as exception: + print(exception) + return 'failure', f'{modelscope_path_name} is not the same as {local_path_name}' + + with ThreadPoolExecutor(thread) as executor: + futures = { + executor.submit(compare_datasets, ms_conf, local_conf): (ms_conf, local_conf) + for ms_conf, local_conf in zip(ms_datasets_conf, local_datasets_conf) + } + + for future in tqdm(as_completed(futures), total=len(futures)): + result, message = future.result() + if result == 'success': + successful_comparisons.append(message) + else: + failed_comparisons.append(message) + + # 输出测试总结 + total_datasets = len(ms_datasets_conf) + print(f"All {total_datasets} datasets") + print(f"OK {len(successful_comparisons)} datasets") + for success in successful_comparisons: + print(f" {success}") + print(f"Fail {len(failed_comparisons)} datasets") + for failure in failed_comparisons: + print(f" {failure}") + + +def _check_data(ms_dataset: Dataset | DatasetDict, + oc_dataset: Dataset | DatasetDict, + sample_size): + assert type(ms_dataset) == type( + oc_dataset + ), f'Dataset type not match: {type(ms_dataset)} != {type(oc_dataset)}' + + # match DatasetDict + if isinstance(oc_dataset, DatasetDict): + assert ms_dataset.keys() == oc_dataset.keys( + ), f'DatasetDict not match: {ms_dataset.keys()} != {oc_dataset.keys()}' + + for key in ms_dataset.keys(): + _check_data(ms_dataset[key], oc_dataset[key], sample_size=sample_size) + + elif isinstance(oc_dataset, Dataset): + # match by cols + assert set(ms_dataset.column_names) == set( + oc_dataset.column_names + ), f'Column names do not match: {ms_dataset.column_names} != {oc_dataset.column_names}' + + # Check that the number of rows is the same + assert len(ms_dataset) == len( + oc_dataset + ), f'Number of rows do not match: {len(ms_dataset)} != {len(oc_dataset)}' + + # Randomly sample indices + sample_indices = random.sample(range(len(ms_dataset)), + min(sample_size, len(ms_dataset))) + + for i, idx in enumerate(sample_indices): + for col in ms_dataset.column_names: + ms_value = clean_string(str(ms_dataset[col][idx])) + oc_value = clean_string(str(oc_dataset[col][idx])) + try: + assert ms_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {ms_value} != {oc_value}" + except AssertionError as e: + print(f"Assertion failed for column '{col}', index {idx}") + print(f"ms_data: {ms_dataset[idx]}") + print(f'oc_data: {oc_dataset[idx]}') + print(f'ms_value: {ms_value} ({type(ms_value)})') + print(f'oc_value: {oc_value} ({type(oc_value)})') + raise e + else: + raise 
ValueError(f'Datasets type not supported {type(ms_dataset)}') + + +if __name__ == '__main__': + sample_size = 100 + thread = 1 + unittest.main()
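
The dataset table added earlier in this patch pairs each OSS mirror URL with an md5 checksum. As a minimal sketch of how such an entry could be consumed — assuming a hypothetical `fetch_dataset` helper and a local `./data` cache directory, not the downloader OpenCompass actually ships — the checksum can be verified before the archive is unpacked:

```python
import hashlib
import urllib.request
import zipfile
from pathlib import Path

# One url/md5 entry copied from the table in the patch above (aime2025).
DATASETS = {
    "/aime2025": {
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime2025.zip",
        "md5": "aa18cd5d2e2de246c5397f5eb1e61004",
    },
}


def fetch_dataset(key: str, cache_dir: str = "./data") -> Path:
    """Download a zip listed in DATASETS, verify its md5, then unpack it.

    Hypothetical helper for illustration only.
    """
    info = DATASETS[key]
    cache = Path(cache_dir)
    cache.mkdir(parents=True, exist_ok=True)
    archive = cache / Path(info["url"]).name

    # Fetch the archive only if it is not already cached locally.
    if not archive.exists():
        urllib.request.urlretrieve(info["url"], archive)

    # Verify integrity against the checksum recorded in the table.
    digest = hashlib.md5(archive.read_bytes()).hexdigest()
    if digest != info["md5"]:
        raise ValueError(f"md5 mismatch for {key}: {digest} != {info['md5']}")

    with zipfile.ZipFile(archive) as zf:
        zf.extractall(cache)
    return cache


if __name__ == "__main__":
    fetch_dataset("/aime2025")
```

Checking the archive's md5 before extraction catches truncated or corrupted downloads early, which is presumably why the table records a checksum alongside each URL.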