# Per-model accuracy scores, grouped by model family type.
#
# Layout:
#   <group>:            # `chat` or `base`
#     <model-name>:     # model identifier, suffixed with its backend
#       <metric>: <score>
#
# `chat` entries record gsm8k_accuracy and race-high_accuracy.
# `base` entries additionally record GPQA_diamond_accuracy and
# winogrande_accuracy.
#
# NOTE(review): these are presumably reference values that fresh evaluation
# runs are compared against - confirm with the consuming script before
# editing any number. Numeric literals are kept exactly as recorded.

chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  glm-4-9b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 90.62
  glm-4-9b-chat-vllm:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 90.62
  deepseek-7b-chat-hf:
    gsm8k_accuracy: 46.88
    race-high_accuracy: 81.25
  deepseek-r1-distill-llama-8b-turbomind:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 81.25
  deepseek-r1-distill-qwen-1_5b-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 53.12
  deepseek-7b-chat-vllm:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 78.12
  gemma2-2b-it-hf:
    gsm8k_accuracy: 50
    race-high_accuracy: 75
  gemma2-9b-it-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 84.38
  gemma-2b-it-hf:
    gsm8k_accuracy: 3.12
    race-high_accuracy: 40.62
  gemma-7b-it-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 68.75
  gemma-2-9b-it-turbomind:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 84.38
  gemma-2-27b-it-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  gemma-7b-it-vllm:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 68.75
  internlm2_5-7b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  internlm3-8b-instruct-hf:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 87.5
  internlm2_5-7b-chat-turbomind:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 90.62
  internlm2-chat-1.8b-turbomind:
    gsm8k_accuracy: 25.00
    race-high_accuracy: 84.38
  internlm2-chat-1.8b-sft-turbomind:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 84.38
  internlm2-chat-7b-lmdeploy:
    gsm8k_accuracy: 59.38
    race-high_accuracy: 87.50
  internlm2-chat-7b-sft-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 87.50
  internlm3-8b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 87.5
  internlm2-chat-7b-vllm:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 87.50
  llama-3_1-8b-instruct-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 81.25
  llama-3-8b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 87.5
  llama-2-7b-chat-turbomind:
    gsm8k_accuracy: 18.75
    race-high_accuracy: 46.88
  llama-3_1-8b-instruct-turbomind:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 81.25
  llama-3-8b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 84.38
  mistral-7b-instruct-v0.2-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-7b-instruct-v0.3-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-nemo-instruct-2407-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 81.25
  mistral-nemo-instruct-2407-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 75
  mistral-7b-instruct-v0.1-vllm:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 65.62
  mistral-7b-instruct-v0.2-vllm:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 78.12
  qwen2.5-0.5b-instruct-hf:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 46.88
  qwen2.5-3b-instruct-hf:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 90.62
  qwen2.5-0.5b-instruct-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 43.75
  qwen2.5-3b-instruct-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 90.62
  qwen1.5-0.5b-chat-hf:
    gsm8k_accuracy: 0
    race-high_accuracy: 53.12
  qwen2-1.5b-instruct-hf:
    gsm8k_accuracy: 62.5
    race-high_accuracy: 84.38
  qwen2-7b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 90.62
  qwen2-1.5b-instruct-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  qwen2-7b-instruct-turbomind:
    gsm8k_accuracy: 75.00
    race-high_accuracy: 87.50
  qwen1.5-0.5b-chat-vllm:
    gsm8k_accuracy: 6.25
    race-high_accuracy: 53.12
  yi-1.5-6b-chat-hf:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 93.75
  yi-1.5-6b-chat-turbomind:
    gsm8k_accuracy: 59.38
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  deepseek-v2_lite-chat-turbomind:
    gsm8k_accuracy: 43.75
    race-high_accuracy: 71.88
  gemma2-27b-it-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 93.75
  internlm2_5-20b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 87.5
  internlm2_5-20b-chat-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 87.5
  mistral-small-instruct-2409-hf:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  mistral-small-instruct-2409-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 87.50
  phi-4:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  qwen2.5-14b-instruct-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 96.88
  qwen2.5-14b-instruct-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 96.88
  yi-1.5-34b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 93.75
  deepseek-67b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 75.00
  deepseek-r1-distill-qwen-32b-turbomind:
    gsm8k_accuracy: 31.25
    race-high_accuracy: 90.62
  llama-3_3-70b-instruct-turbomind:
    gsm8k_accuracy: 93.75
    race-high_accuracy: 87.5
  mixtral-large-instruct-2411-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 93.75
  nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 53.12
  qwen2.5-72b-instruct-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 90.62
  deepseek-r1-distill-llama-70b-turbomind:
    gsm8k_accuracy: 50.00
    race-high_accuracy: 87.50
  deepseek-v2_5-1210-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 84.38
  mixtral-8x22b-instruct-v0.1-turbomind:
    gsm8k_accuracy: 75.00
    race-high_accuracy: 78.12
  mixtral-8x22b-instruct-v0.1-vllm:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 78.12

# Base models: four metrics per entry.
base:
  glm-4-9b-turbomind:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  deepseek-7b-base-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 46.88
    winogrande_accuracy: 71.88
  deepseek-7b-base-turbomind:
    gsm8k_accuracy: 18.75
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 50.00
    winogrande_accuracy: 84.38
  deepseek-moe-16b-base-vllm:
    gsm8k_accuracy: 25.00
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 25
    winogrande_accuracy: 68.75
  gemma2-2b-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 56.25
    winogrande_accuracy: 75.00
  gemma2-9b-hf:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 84.38
    winogrande_accuracy: 81.25
  gemma-2b-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 21.88
    winogrande_accuracy: 53.12
  gemma-7b-hf:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 65.62
    winogrande_accuracy: 71.88
  gemma-2-9b-turbomind:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 84.38
    winogrande_accuracy: 81.25
  gemma-2b-vllm:
    gsm8k_accuracy: 15.62
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 28.12
    winogrande_accuracy: 68.75
  gemma-7b-vllm:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 81.25
    winogrande_accuracy: 81.25
  internlm2_5-7b-hf:
    gsm8k_accuracy: 37.5
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 93.75
    winogrande_accuracy: 71.88
  internlm2-7b-hf:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 62.5
    winogrande_accuracy: 78.12
  internlm2-1.8b-turbomind:
    gsm8k_accuracy: 12.50
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 71.88
    winogrande_accuracy: 75
  internlm2_5-7b-turbomind:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.5
  internlm2-7b-turbomind:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 25.00
    race-high_accuracy: 78.12
    winogrande_accuracy: 71.88
  internlm2-base-7b-turbomind:
    gsm8k_accuracy: 25.00
    GPQA_diamond_accuracy: 34.38
    race-high_accuracy: 71.88
    winogrande_accuracy: 62.50
  llama-2-7b-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 40.62
    winogrande_accuracy: 71.88
  llama-3_1-8b-hf:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 90.62
    winogrande_accuracy: 62.5
  llama-3-8b-hf:
    gsm8k_accuracy: 46.88
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 65.62
    winogrande_accuracy: 65.62
  llama-3.1-8b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 78.12
    winogrande_accuracy: 78.12
  llama-3-8b-turbomind:
    gsm8k_accuracy: 46.88
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 65.62
    winogrande_accuracy: 81.25
  mistral-7b-v0.3-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 62.5
    winogrande_accuracy: 59.38
  qwen2.5-7b-hf:
    gsm8k_accuracy: 81.25
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  qwen2.5-1.5b-turbomind:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 78.12
    winogrande_accuracy: 71.88
  qwen2.5-7b-turbomind:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 87.5
    winogrande_accuracy: 75.00
  qwen1.5-moe-a2.7b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 84.38
    winogrande_accuracy: 75
  qwen2-0.5b-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 40.62
    winogrande_accuracy: 62.5
  qwen2-1.5b-hf:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 81.25
    winogrande_accuracy: 62.5
  qwen2-7b-hf:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 87.5
    winogrande_accuracy: 68.75
  qwen2-1.5b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 81.25
    winogrande_accuracy: 75
  qwen2-7b-turbomind:
    gsm8k_accuracy: 65.62
    GPQA_diamond_accuracy: 12.5
    race-high_accuracy: 87.5
    winogrande_accuracy: 75
  qwen1.5-0.5b-vllm:
    gsm8k_accuracy: 9.38
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 56.25
    winogrande_accuracy: 59.38
  yi-1.5-6b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 87.5
    winogrande_accuracy: 62.5
  yi-1.5-9b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 59.38
  yi-1.5-9b-turbomind:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 65.62
  internlm2-20b-turbomind:
    gsm8k_accuracy: 71.88
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 68.75
    winogrande_accuracy: 81.25
  qwen2.5-14b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 37.5
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-32b-hf:
    gsm8k_accuracy: 87.5
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 78.12
  qwen2.5-32b-turbomind:
    gsm8k_accuracy: 90.62
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 81.25
  deepseek-67b-base-turbomind:
    gsm8k_accuracy: 62.50
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 78.12
    winogrande_accuracy: 81.25
  llama-3-70b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 15.62
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-72b-turbomind:
    gsm8k_accuracy: 84.38
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.5
  deepseek-v2-turbomind:
    gsm8k_accuracy: 65.62
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 81.25