{{ content }}
import io import os from collections import defaultdict from concurrent.futures import ThreadPoolExecutor, as_completed from multiprocessing.pool import ThreadPool from typing import Any, Callable import jinja2 import numpy as np import requests from tqdm import tqdm from .types import EvalResult, Message, SamplerBase, SingleEvalResult QUERY_TEMPLATE_MULTICHOICE = """ Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. {Question} A) {A} B) {B} C) {C} D) {D} """.strip() ANSWER_PATTERN_MULTICHOICE = r'(?i)Answer[ \t]*:[ \t]*\$?([A-D])\$?' ANSWER_PATTERN = r'(?i)Answer\s*:\s*([^\n]+)' MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( '(?i){}[ \t]*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])') # All the different ways "Answer" is written in different languages MULTILINGUAL_ANSWER_REGEXES = [ 'Answer\s*:', 'Answer\s*:', # Korean invisible character 'উত্তর\s*:', 'उत्तर\s*:', 'উত্তরঃ', 'উত্তর\s*:', 'Antwort\s*:', '답변\s*:', '정답\s*:', '답\s*:', '答案\s*:', '答案\s*:', '答\s*:', '答\s*:', '答复\s*:', '答曰\s*:', 'الإجابة:', 'الجواب:', 'إجابة:', 'الإجابة النهائية:', 'الإجابة الصحيحة:', 'الإجابة الصحيحة هي:', 'الإجابة هي:', 'الجواب النهائي:', 'Respuesta\s*:', 'Risposta\s*:', '答え\s*:', '答え\s*:', '回答\s*:', '回答\s*:', '解答\s*:', 'Jawaban\s*:', 'Réponse\s*:', 'Resposta\s*:', 'Jibu\s*:', 'Idahun\s*:', 'Ìdáhùn\s*:', 'Idáhùn\s*:', 'Àmọ̀nà\s*:', 'Àdáhùn\s*:', 'Ànúgọ\s*:', 'Àṣàyàn\s*:', ] EQUALITY_TEMPLATE = r""" Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications Examples: Expression 1: $2x+3$ Expression 2: $3+2x$ Yes Expression 1: 3/2 Expression 2: 1.5 Yes Expression 1: $x^2+2x+1$ Expression 2: $y^2+2y+1$ No Expression 1: $x^2+2x+1$ Expression 2: $(x+1)^2$ Yes Expression 1: 3245/5 Expression 2: 649 No (these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications) Expression 1: 2/(-3) Expression 2: -2/3 Yes (trivial simplifications are allowed) Expression 1: 72 degrees Expression 2: 72 Yes (give benefit of the doubt to units) Expression 1: 64 Expression 2: 64 square feet Yes (give benefit of the doubt to units) --- YOUR TASK Respond with only "Yes" or "No" (without quotes). Do not include a rationale. Expression 1: %(expression1)s Expression 2: %(expression2)s """.strip() HTML_JINJA = """
Correct Answer: {{ correct_answer }}
Extracted Answer: {{ extracted_answer }}
Score: {{ score }}
""" def format_multichoice_question(row): return QUERY_TEMPLATE_MULTICHOICE.format(**row) def check_equality(sampler: SamplerBase, expr1: str, expr2: str): prompt = EQUALITY_TEMPLATE % {'expression1': expr1, 'expression2': expr2} sampler_response = sampler([dict(content=prompt, role='user')]) response_text = sampler_response.response_text return response_text.lower().strip() == 'yes' def _compute_stat(values: list, stat: str): if stat == 'mean': return np.mean(values) elif stat == 'std': return np.std(values) elif stat == 'min': return np.min(values) elif stat == 'max': return np.max(values) elif stat == 'n_samples': return len(values) elif stat == 'bootstrap_std': return np.std([ np.mean(np.random.choice(values, len(values))) for _ in range(1000) ]) else: raise ValueError(f'Unknown {stat =}') def aggregate_results( single_eval_results: list[SingleEvalResult], default_stats: tuple[str, ...] = ('mean', 'std'), name2stats: dict[str, tuple[str]] | None = None, ) -> EvalResult: """Aggregate results from multiple evaluations into a single EvalResult.""" name2stats = name2stats or {} name2values = defaultdict(list) htmls = [] convos = [] metadata = [] for single_eval_result in single_eval_results: for name, value in single_eval_result.metrics.items(): name2values[name].append(value) if single_eval_result.score is not None: name2values['score'].append(single_eval_result.score) htmls.append(single_eval_result.html) convos.append(single_eval_result.convo) metadata.append(single_eval_result.example_level_metadata) final_metrics = {} for name, values in name2values.items(): stats = name2stats.get(name, default_stats) for stat in stats: key = name if stat == 'mean' else f'{name}:{stat}' final_metrics[key] = _compute_stat(values, stat) return EvalResult( score=final_metrics.pop('score', None), metrics=final_metrics, htmls=htmls, convos=convos, metadata={'example_level_metadata': metadata}, ) def map_with_progress( f: Callable, xs: list[Any], num_threads: int = os.cpu_count() or 10, pbar: bool = True, ): """Apply f to each element of xs, using a ThreadPool, and show progress.""" pbar_fn = tqdm if pbar else lambda x, *args, **kwargs: x if os.getenv('debug'): return list(map(f, pbar_fn(xs, total=len(xs)))) else: with ThreadPool(min(num_threads, len(xs))) as pool: return list(pbar_fn(pool.imap(f, xs), total=len(xs))) jinja_env = jinja2.Environment( loader=jinja2.BaseLoader(), undefined=jinja2.StrictUndefined, autoescape=jinja2.select_autoescape(['html', 'xml']), ) _message_template = """ """ def message_to_html(message: Message) -> str: """Generate HTML snippet (inside aMetric | Value |
---|---|
Score | {{ score | float | round(3) }} |
{{ name }} | {{ value }} |