"""Wash step-labelled math solutions with an LLM judge.

Reads a JSONL file of records (``question``, ``process``, ``label``,
``answer``), asks a chat-completions model whether each flagged solution
reaches the reference answer, and writes one merged JSON record per input
line to an output JSONL file.
"""

import asyncio  # New import added for async handling
import heapq
import itertools
import json
import math
import random
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import openai
from openai import AsyncOpenAI  # Using AsyncOpenAI as client
from openai import OpenAI
from transformers import AutoTokenizer

# Delimiter between reasoning steps in the ``process`` field.
STEP_DELIMITER = "\n\n"

# Judge endpoint. The key below is the placeholder token of the local
# server; real provider credentials must come from the environment and
# never be committed to source (not even inside a comment).
JUDGE_BASE_URL = "http://localhost:8014/v1"
JUDGE_API_KEY = "token-abc123"
JUDGE_MODEL_NAME = "DeepSeek-R1-Distill-Qwen-14B"

INPUT_PATH = '/data/zeju/O1_data/0311_test_training_new_processed.jsonl'
OUTPUT_PATH = '/data/zeju/O1_data/0312_test_80_washdata.jsonl'

# The original prompt told the judge to compare against a
# "[Reference Solution]" section that the prompt never contains; the
# reference now names the "[Answer]" section that is actually shown.
JUDGE_PROMPT_TEMPLATE = (
    "\n"
    "I will show you a [Math Problem], the [Answer], and an [AI's Solution] "
    "generated by an AI assistant. Your task is to determine if the "
    "**final answer** in the [AI's Solution] matches the answer in the "
    "[Answer].\n"
    "--------------------------------------------------\n"
    "[Math Problem]\n"
    "\n"
    "{problem}\n"
    "\n"
    "[Answer]\n"
    "\n"
    "{final_answer}\n"
    "\n"
    "[AI's Solution]\n"
    "\n"
    "{steps}\n"
    "--------------------------------------------------\n"
    "Please evaluate whether the **Answer:** in the [AI's Solution] is "
    "correct, based solely on whether it matches the **final answer** in "
    "the [Answer]. Note that the [AI's Solution] does not need to replicate "
    "same reasoning steps of the [Answer]; it only needs to reach the same "
    "**final answer** to be considered correct.\n"
    "Reply with only \"Yes\" or \"No\" in the end of your response.\n"
)

# Whole-word match, so "Not", "None", "Now", ... are not read as a verdict.
_VERDICT_RE = re.compile(r'\b(Yes|No)\b')

# Shared judge client, created on first use by ``_get_client``.
client: Optional[OpenAI] = None


def read_jsonl(file_path: str) -> List[Any]:
    """Load a JSON Lines file into a list, one parsed object per line.

    Blank lines are skipped instead of raising ``json.JSONDecodeError``.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


def extract_answer_judge(solution_text: str) -> Optional[str]:
    r"""Return the content of the last ``\boxed{...}`` in *solution_text*.

    Braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole rather than being cut at the
    first ``}``. Returns ``None`` when no balanced ``\boxed{...}`` exists.
    """
    marker = '\\boxed{'
    last_content = None
    search_from = 0
    while True:
        start = solution_text.find(marker, search_from)
        if start == -1:
            break
        body_start = start + len(marker)
        depth = 1
        pos = body_start
        while pos < len(solution_text) and depth:
            char = solution_text[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            pos += 1
        if depth == 0:
            last_content = solution_text[body_start:pos - 1]
            search_from = pos
        else:
            # Unbalanced: ignore this marker and keep scanning after it.
            search_from = body_start
    return None if last_content is None else last_content.strip()


def separate_steps(steps: List[str], mode: str = 'join') -> Any:
    """Join a list of steps into one string, or split a string into steps.

    Args:
        steps: A list of strings for ``'join'``; a single string for
            ``'split'``.
        mode: ``'join'`` or ``'split'``.

    Returns:
        The joined string or the list of steps.

    Raises:
        TypeError: If *steps* has the wrong type for *mode*.
        ValueError: If *mode* is neither ``'join'`` nor ``'split'``.
    """
    if mode == 'join':
        if not isinstance(steps, list):
            raise TypeError("For 'join' mode, 'steps' must be a list of strings.")
        return STEP_DELIMITER.join(steps)
    if mode == 'split':
        if not isinstance(steps, str):
            raise TypeError("For 'split' mode, 'steps' must be a string.")
        return steps.split(STEP_DELIMITER)
    raise ValueError("Mode should be either 'join' or 'split'.")


def _get_client() -> OpenAI:
    """Return the shared judge client, creating it on first use."""
    global client
    if client is None:
        client = OpenAI(base_url=JUDGE_BASE_URL, api_key=JUDGE_API_KEY)
    return client


def _parse_verdict(response: Optional[str]) -> Optional[str]:
    """Extract "Yes"/"No" from the last three words of *response*.

    Returns ``None`` when the response is empty or its tail holds no
    verdict. If both words appear in the tail, the last one wins because
    the prompt asks for the verdict at the very end.
    """
    if not response:
        return None
    tail = ' '.join(response.strip().split()[-3:])
    verdicts = _VERDICT_RE.findall(tail)
    return verdicts[-1] if verdicts else None


def evaluate_llm_as_judge(problem: str, steps: list, final_answer,
                          output_type: str = 'bool') -> Dict[str, Any]:
    """Ask the judge model whether *steps* reach *final_answer*.

    Args:
        problem: The problem statement.
        steps: The solution's reasoning steps (list of strings, or one
            already-joined string).
        final_answer: The reference answer shown to the judge.
        output_type: Unused; kept so existing callers keep working.

    Returns:
        A dict with keys ``question``, ``final_answer``,
        ``reasining_steps`` (spelling kept: it is part of the output
        schema), ``yes_or_no`` ("Yes", "No" or ``None``) and ``response``
        (the raw judge reply).
    """
    # Show the judge readable text instead of a Python list repr.
    solution_text = steps if isinstance(steps, str) else separate_steps(list(steps), 'join')
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        problem=problem,
        final_answer=final_answer,
        steps=solution_text,
    )

    completion = _get_client().chat.completions.create(
        model=JUDGE_MODEL_NAME,
        messages=[{'role': 'user', 'content': judge_prompt}],
        n=1,
        temperature=0.6,
        max_tokens=8192,
    )
    response = completion.choices[0].message.content
    print("*****Verification*****:", response)

    yes_or_no = _parse_verdict(response)
    print('yes_or_no', yes_or_no)

    return {
        'question': problem,
        'final_answer': final_answer,
        'reasining_steps': steps,
        'yes_or_no': yes_or_no,
        'response': response,
    }


def main(input_path: str = INPUT_PATH, output_path: str = OUTPUT_PATH) -> None:
    """Judge every record of *input_path* and write results to *output_path*."""
    data_all = read_jsonl(input_path)
    print(len(data_all))

    output = []
    for index, data in enumerate(data_all):
        print(index)
        problem = data['question']
        final_answer = data['answer']

        steps = separate_steps(data['process'], 'split')
        steps[0] = problem + ' ' + steps[0]
        # The last chunk is dropped, as in the original pipeline
        # (presumably a trailing non-step chunk; verify against the data).
        steps_updated = steps[:-1]

        if 0 in data['label']:
            # Only records with a 0 among their labels go to the judge.
            output.append(evaluate_llm_as_judge(
                problem=problem,
                steps=steps_updated,
                final_answer=final_answer,
                output_type='bool',
            ))
        else:
            # Records without a 0 label are recorded as "No" with a
            # sentinel response and never sent to the judge.
            output.append({
                'question': problem,
                'final_answer': final_answer,
                'reasining_steps': steps_updated,
                'yes_or_no': "No",
                'response': '\n\n-1',
            })

    # A separate name for the handle keeps the path printable below.
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in output
        )

    print(f"数据已成功写入 {output_path}")


if __name__ == "__main__":
    main()