# Module-level logger, following the standard one-logger-per-module convention.
logger = logging.getLogger(__name__)
1616
# Initialize the OpenAI-compatible client.
# The base_url targets a Hugging Face Inference Endpoint that exposes the
# OpenAI /v1 chat API (replacing the earlier local vLLM-style server at
# localhost:8000).  NOTE(review): OPENAI_API_KEY is reused as the bearer
# token for the HF endpoint — presumably it holds the endpoint token
# rather than a real OpenAI key; confirm with the deployment config.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://ot7nh9nqf4l7b43s.us-east-1.aws.endpoints.huggingface.cloud/v1/",
)
1919
2020SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.
2121
@@ -241,18 +241,21 @@ def analyze_results(results: List[Dict], n: int):
241241 print ("---" )
242242
243243def main (model : str , n_attempts : int ):
244- """Main evaluation function."""
244+ """Main evaluation function that handles gaps in processed indexes ."""
245245 os .makedirs ("results" , exist_ok = True )
246246
247- # Include n_attempts in filename to keep separate results for different n values
248247 results_file = f"evaluation_results_{ model .replace ('/' , '_' )} _pass_at_{ n_attempts } .json"
249248
250249 dataset = load_2024_dataset ()
251250 existing_results = load_existing_results (results_file )
252- last_processed_index = get_last_processed_index (existing_results )
253251
254- for idx , item in enumerate (tqdm (dataset , desc = "Evaluating problems" )):
255- if idx <= last_processed_index :
252+ # Create a set of already processed indexes for efficient lookup
253+ processed_indexes = {result ['index' ] for result in existing_results }
254+
255+ for _ , item in enumerate (tqdm (dataset , desc = "Evaluating problems" )):
256+ id = int (item ['id' ])
257+ # Skip if this index has already been processed
258+ if id in processed_indexes :
256259 continue
257260
258261 problem_text = item ['problem' ]
@@ -263,7 +266,7 @@ def main(model: str, n_attempts: int):
263266 is_correct , first_correct = evaluate_pass_at_n (attempts , correct_answer )
264267
265268 result = {
266- "index" : idx ,
269+ "index" : id ,
267270 "problem" : problem_text ,
268271 "attempts" : attempts ,
269272 "correct_answer" : correct_answer ,
@@ -275,6 +278,7 @@ def main(model: str, n_attempts: int):
275278 final_results = load_existing_results (results_file )
276279 analyze_results (final_results , n_attempts )
277280
281+
278282if __name__ == "__main__" :
279283 parser = argparse .ArgumentParser (description = "Evaluate LLM performance on AIME 2024 problems" )
280284 parser .add_argument ("--model" , type = str , required = True , help = "OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)" )
0 commit comments