From 8f2b8c69fcea633f82dffd3d040f6db0f83f7a61 Mon Sep 17 00:00:00 2001 From: robotMonkeyButler Date: Fri, 25 Jul 2025 19:22:01 +0000 Subject: [PATCH 1/4] update --- examples/hf_auto_eval.py | 242 +++++++-------- tiny_scientist/coder.py | 11 +- tiny_scientist/coder_docker.py | 353 ++++++++++++++++++++++ tiny_scientist/fewshot_sample/__init__.py | 0 tiny_scientist/tool.py | 56 +++- 5 files changed, 509 insertions(+), 153 deletions(-) create mode 100644 tiny_scientist/coder_docker.py create mode 100644 tiny_scientist/fewshot_sample/__init__.py diff --git a/examples/hf_auto_eval.py b/examples/hf_auto_eval.py index d565597..ae165c5 100644 --- a/examples/hf_auto_eval.py +++ b/examples/hf_auto_eval.py @@ -1,147 +1,121 @@ +#!/usr/bin/env python3 +""" +Batch‑evaluate all model/dataset/metric triples from perfect_model_dataset_metrics.json + +Usage: + python hf_eval_all.py --llm-model gpt-4o --runs 1 --max-fixes 5 --limit 20 +""" + import argparse +import json +import os +import sys +from pathlib import Path -from tiny_scientist import TinyScientist - - -def test_docker_availability() -> bool: - """Test if Docker is available.""" - try: - from tiny_scientist.tool import DockerExperimentRunner - - runner = DockerExperimentRunner() - if runner.use_docker: - print("✅ Docker is available and will be used") - return True - else: - print("⚠️ Docker is not available, will use local execution") - return False - except Exception as e: - print(f"❌ Error checking Docker: {e}") - return False - - -def create_formatted_idea(model: str, dataset: str, metric: str) -> dict: - """Create a formatted idea dictionary that matches TinyScientist's expected structure.""" - return { - "Name": f"evaluate_{model.replace('/', '_').replace('-', '_')}_{dataset.replace('/', '_').replace('-', '_')}", - "Title": f"Evaluating {model} on {dataset} using {metric} Metric", - "Description": f"Reproduce and evaluate the performance of the Hugging Face model {model} on the {dataset} dataset, specifically measuring the {metric} metric to establish baseline performance.", - "Problem": f"Need to reproduce and validate the evaluation of {model} on {dataset} with focus on {metric} metric for performance verification and comparison.", - "Importance": f"Reproducing model evaluations is crucial for scientific reproducibility and establishing reliable baselines. 
The {metric} metric provides key insights into model performance on {dataset}.", - "Difficulty": "Moderate - requires proper model loading, dataset preprocessing, and evaluation setup, but uses standard HuggingFace libraries.", - "NoveltyComparison": f"While model evaluation is standard practice, this specific reproduction of {model} on {dataset} focusing on {metric} provides valuable validation and baseline establishment.", - "Approach": f"Load the pre-trained {model} from HuggingFace, prepare the {dataset} dataset, implement evaluation pipeline, and compute {metric} along with other relevant metrics.", - "is_experimental": True, - "Interestingness": 6, - "Feasibility": 9, - "Novelty": 4, - "IntentAlignment": 10, - "Score": 7, - "Experiment": {"Model": model, "Dataset": dataset, "Metric": metric}, - } +# Allow importing tiny_scientist +sys.path.insert(0, str(Path(__file__).parent.parent)) +from tiny_scientist.coder_docker import DockerCoder -def main(): +def load_combinations(json_path: Path): + if not json_path.exists(): + raise FileNotFoundError(f"Config file not found: {json_path}") + with open(json_path, "r") as f: + data = json.load(f) + return data.get("results", []) + + +def iter_triples(combos): """ - This script uses TinyScientist to automate the process of reproducing - a model evaluation on a given dataset for a specific task. + Yield (model_id, dataset_id, metric_name) for every simple metric. + Skips metrics whose value is a dict (nested/complex). """ - parser = argparse.ArgumentParser( - description="Reproduce a model evaluation using TinyScientist." - ) - parser.add_argument( - "--model", - type=str, - required=True, - help="The Hugging Face model name (e.g., 'dslim/bert-base-NER').", - ) - parser.add_argument( - "--dataset", - type=str, - required=True, - help="The Hugging Face dataset name (e.g., 'eriktks/conll2003').", - ) - parser.add_argument( - "--metric", - type=str, - required=True, - help="The specific metric to evaluate (e.g., 'F1', 'accuracy', 'BLEU', 'ROUGE', 'precision', 'recall').", - ) - parser.add_argument( - "--gpt_model", - type=str, - default="gpt-4o", - help="The GPT model to use for TinyScientist.", - ) - parser.add_argument( - "--use_docker", - action="store_true", - default=True, - help="Use Docker for experiment execution (default: True)", - ) + for combo in combos: + model = combo.get("model_id") + dataset = combo.get("dataset_id") + metrics = combo.get("metrics", {}) or {} + for metric_name, value in metrics.items(): + if value is None or isinstance(value, dict): + continue + yield model, dataset, metric_name + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--json-file", default=str(Path(__file__).parent / "perfect_model_dataset_metrics.json")) + parser.add_argument("--llm-model", default="gpt-4o") + parser.add_argument("--runs", type=int, default=1) + parser.add_argument("--max-fixes", type=int, default=5) + parser.add_argument("--limit", type=int, default=10, help="Evaluate only first N triples (0 = all)") args = parser.parse_args() - # Test Docker availability - docker_available = test_docker_availability() - - if args.use_docker and not docker_available: - print("⚠️ Docker requested but not available, falling back to local execution") - args.use_docker = False - - # Before running, ensure you have tiny_scientist installed: - # pip install tiny-scientist - - # Initialize TinyScientist with the specified model and Docker configuration - print(f"Initializing TinyScientist with model: {args.gpt_model}") - print(f"Docker 
enabled: {args.use_docker}") - scientist = TinyScientist(model=args.gpt_model, use_docker=args.use_docker) - - # 1. Define the research intent based on user input. - # This string is the core instruction for TinyScientist. - intent = ( - f"I want to write a script to reproduce the evaluation of the Hugging Face model '{args.model}' " - f"on the dataset '{args.dataset}'. I want to specifically measure the {args.metric} metric. " - f"The script should load the model and dataset, run the evaluation, " - f"and report the {args.metric} metric along with other relevant evaluation metrics." - ) - - print(f"🔬 Intent: {intent}") - - # Step 1: Create a formatted idea directly (skipping scientist.think) - print("\nStep 1: Creating formatted research idea...") - idea = create_formatted_idea(args.model, args.dataset, args.metric) - print("✅ Research idea created.") - print(f"📋 Idea Title: {idea['Title']}") - print(f"📊 Target Metric: {idea['Experiment']['Metric']}") - - # Step 2: Generate and run the experiment code - print("\nStep 2: Generating and running experiment code...") - status, experiment_dir = scientist.code(idea=idea) - - # If the experiments run successfully, proceed to writing the paper - if status is True: - print(f"✅ Experiments completed successfully. Results are in: {experiment_dir}") - - # Step 3: Write a research paper based on the findings - print("\nStep 3: Writing a research paper...") - pdf_path = scientist.write(idea=idea, experiment_dir=experiment_dir) - if not pdf_path: - print("❌ Failed to write the paper.") - return - print(f"✅ Paper written and saved to: {pdf_path}") - - # Step 4: Review the generated paper - print("\nStep 4: Reviewing the paper...") - review = scientist.review(pdf_path=pdf_path) - print("✅ Review complete.") - print("\n--- Paper Review ---") - print(review) - print("--------------------") - else: - print( - f"❌ Experiments failed. 
Check the logs in the experiment directory: {experiment_dir}" + combos = load_combinations(Path(args.json_file)) + triples = list(iter_triples(combos)) + if args.limit and args.limit > 0: + triples = triples[: args.limit] + + if not triples: + print("No evaluable triples found.") + return + + os.makedirs("simple_results", exist_ok=True) + + summary = [] + success_count = 0 + + print(f"Starting evaluation of {len(triples)} triples...\n") + for i, (model, dataset, metric) in enumerate(triples, 1): + print("="*50) + print(f"[{i}/{len(triples)}] {model} | {dataset} | {metric}") + print("="*50) + safe_dir = f"{model}_{dataset}_{metric}".replace("/", "_").replace("-", "_") + out_dir = f"simple_results/{safe_dir}" + + try: + # Create a new DockerCoder instance for each evaluation with its own output directory + coder = DockerCoder(model=args.llm_model, output_dir=out_dir) + success, message = coder.evaluate_model( + model_name=model, + dataset_name=dataset, + metric=metric, + max_runs=args.runs, + max_fixes=args.max_fixes, + ) + except Exception as e: + success, message = False, f"Exception: {e}" + + summary.append( + { + "model": model, + "dataset": dataset, + "metric": metric, + "success": success, + "message": message, + "output_dir": out_dir if success else None, + } ) + if success: + success_count += 1 + + # Save summary + summary_path = "simple_results/batch_summary.json" + with open(summary_path, "w") as f: + json.dump( + { + "total": len(triples), + "success": success_count, + "failed": len(triples) - success_count, + "success_rate": success_count / len(triples), + "results": summary, + }, + f, + indent=2, + ) + + print("\nDone.") + print(f"Success: {success_count} / {len(triples)} " + f"({success_count / len(triples) * 100:.1f}%)") + print(f"Summary written to {summary_path}") if __name__ == "__main__": diff --git a/tiny_scientist/coder.py b/tiny_scientist/coder.py index 784274b..b5889eb 100644 --- a/tiny_scientist/coder.py +++ b/tiny_scientist/coder.py @@ -89,11 +89,12 @@ def setup_aider( def run( self, idea: Dict[str, Any], baseline_results: Optional[Dict[str, Any]] = {} ) -> Tuple[bool, str, Optional[str]]: - # Ensure a clean slate for every run - print(f"[System] Cleaning experiment directory: {self.output_dir}") - if osp.exists(self.output_dir): - shutil.rmtree(self.output_dir) - os.makedirs(self.output_dir) + # Ensure the output directory exists. + # NOTE: Directory cleaning has been removed for safety. + # Old experiment files might persist in subsequent runs. + print(f"[System] Ensuring experiment directory exists: {self.output_dir}") + os.makedirs(self.output_dir, exist_ok=True) + fnames = [ osp.join(self.output_dir, "experiment.py"), osp.join(self.output_dir, "notes.txt"), diff --git a/tiny_scientist/coder_docker.py b/tiny_scientist/coder_docker.py new file mode 100644 index 0000000..46edffa --- /dev/null +++ b/tiny_scientist/coder_docker.py @@ -0,0 +1,353 @@ +""" +Simplified Docker-based Code Generation and Execution with Aider Auto-fixing + +This module provides a minimal implementation for: +1. Generating Python experiment code using LLMs +2. Running code in Docker containers +3. 
Auto-fixing errors using Aider inside containers +""" + +import json +import os +import tempfile +from typing import Dict, Optional, Tuple + +import docker +from .utils.llm import create_client, get_response_from_llm + +# Add requests import for HuggingFace API +import requests +import time + + +class DockerCoder: + """Simplified Docker-based code generator with Aider auto-fixing""" + + def __init__(self, model: str = "gpt-4o-mini", output_dir: str = "results"): + try: + self.client, self.model = create_client(model) + print(f"✅ LLM client created: {model}") + except Exception as e: + print(f"❌ LLM client creation failed: {e}") + print("💡 Make sure to set OPENAI_API_KEY or configure API keys") + raise + + self.output_dir = os.path.abspath(output_dir) + self.docker_client = docker.from_env() + + # Create output directory + os.makedirs(self.output_dir, exist_ok=True) + + def evaluate_model( + self, + model_name: str, + dataset_name: str, + metric: str, + max_runs: int = 3, + max_fixes: int = 5 + ) -> Tuple[bool, str]: + """ + Generate and run experiment with automatic error fixing + + Args: + model_name: HuggingFace model identifier + dataset_name: HuggingFace dataset identifier + metric: Evaluation metric (accuracy, f1, etc.) + max_runs: Number of experiment runs + max_fixes: Max Aider fix attempts per failed run + + Returns: + Tuple of (success, message) + """ + print(f"🎯 Evaluating {model_name} on {dataset_name} using {metric}") + + # Step 1: Generate experiment code using LLM + if not self._generate_experiment_code(model_name, dataset_name, metric): + return False, "Failed to generate experiment code" + + # Step 2: Create Docker container + container = self._create_container() + if not container: + return False, "Failed to create Docker container" + + try: + # Step 3: Run experiments with auto-fixing + success_count = 0 + for run in range(1, max_runs + 1): + print(f"\n🔄 Run {run}/{max_runs}") + + if self._run_experiment_with_fixes(container, run, max_fixes): + success_count += 1 + print(f"✅ Run {run} succeeded") + else: + print(f"❌ Run {run} failed after {max_fixes} fix attempts") + + if success_count > 0: + return True, f"Completed {success_count}/{max_runs} successful runs" + else: + return False, f"All {max_runs} runs failed" + + finally: + # Clean up container + container.stop() + container.remove() + + def _generate_experiment_code(self, model_name: str, dataset_name: str, metric: str) -> bool: + """Generate Python experiment code using LLM only - no fallback""" + + # Require LLM client - no fallback + if self.client is None: + print("❌ No LLM client available. Cannot generate experiment code.") + return False + + # Get model README for context + print(f"📖 Fetching README for {model_name}...") + model_readme = self._get_model_readme(model_name) + + # LLM generation only + prompt = f""" +Generate a complete Python script to evaluate the HuggingFace model '{model_name}' +on the '{dataset_name}' dataset using the '{metric}' metric. + +MODEL README INFORMATION: +``` +{model_readme} +``` + +CRITICAL REQUIREMENTS: +1. Import os and use: hf_token = os.getenv('HF_TOKEN') +2. Pass token parameter to from_pretrained(): token=hf_token (NOT use_auth_token) +3. Use device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +4. Move model to device: model.to(device) +5. For dataset processing, iterate through individual examples, NOT batches +6. Process each example individually to avoid tensor conversion issues +7. Use simple list operations for predictions and labels +8. 
Handle errors with try/except and print traceback +9. Use argparse for --out_dir parameter +10. Process max 100 samples for speed (not 1000) +11. Print progress every 50 samples +12. **CRITICAL**: Save ONLY metric results to JSON - NO predictions, NO true_labels arrays +13. **CRITICAL**: Use sklearn.metrics functions to calculate final metric value +14. **CRITICAL**: Follow the model README guidelines for proper usage, including: + - Correct input preprocessing + - Proper label mapping (if specified in README) + - Recommended inference settings + - Any model-specific requirements + +FEW-SHOT EXAMPLE - Results saving pattern (FOLLOW THIS EXACTLY): +```python +# Calculate the final metric value only +if "{metric}".lower() == "accuracy": + from sklearn.metrics import accuracy_score + metric_value = accuracy_score(true_labels, predictions) +elif "{metric}".lower() == "f1": + from sklearn.metrics import f1_score + metric_value = f1_score(true_labels, predictions, average='weighted') +else: + from sklearn.metrics import accuracy_score + metric_value = accuracy_score(true_labels, predictions) # default + +# Save ONLY the calculated metric - DO NOT save predictions or true_labels +results = {{ + "{metric}": metric_value, + "total_samples": len(true_labels) +}} + +os.makedirs(out_dir, exist_ok=True) +with open(os.path.join(out_dir, "evaluation_results.json"), "w") as f: + json.dump(results, f, indent=2) +``` + +Generate the complete script with proper imports, main function, and all necessary code: +""" + try: + print("🤖 Generating experiment code with LLM...") + response, _ = get_response_from_llm( + msg=prompt, + client=self.client, + model=self.model, + system_message="You are an expert Python developer. Generate complete, robust code for ML experiments. Follow the requirements exactly." 
+ ) + + # Extract and save code + code = self._extract_code(response) + exp_path = os.path.join(self.output_dir, "experiment.py") + with open(exp_path, "w") as f: + f.write(code) + + print(f"✅ Generated experiment code: {exp_path}") + return True + + except Exception as e: + print(f"❌ LLM generation failed: {e}") + return False + + def _create_container(self) -> Optional[docker.models.containers.Container]: + """Create Docker container with Python + Aider + ML libraries""" + try: + # Use existing image + image_name = "simple-coder:latest" + + # Environment variables + env_vars = {} + for key in ["HF_TOKEN", "OPENAI_API_KEY", "ANTHROPIC_API_KEY"]: + if os.getenv(key): + env_vars[key] = os.getenv(key) + print(f"🔑 Found {key}: {os.getenv(key)[:10]}...") + else: + print(f"❌ Missing {key}") + + print(f"📦 Environment variables to pass: {list(env_vars.keys())}") + + # Create container + container = self.docker_client.containers.run( + image=image_name, + command="tail -f /dev/null", # Keep running + volumes={self.output_dir: {"bind": "/workspace", "mode": "rw"}}, + working_dir="/workspace", + environment=env_vars, + detach=True, + remove=False, + mem_limit="4g" + ) + + # Initialize git (Aider requirement) + container.exec_run("git init", workdir="/workspace") + container.exec_run("git config user.email 'test@example.com'", workdir="/workspace") + container.exec_run("git config user.name 'Test User'", workdir="/workspace") + container.exec_run("git add .", workdir="/workspace") + container.exec_run("git commit -m 'Initial commit' --allow-empty", workdir="/workspace") + + print(f"✅ Container ready: {container.id[:12]}") + return container + + except Exception as e: + print(f"❌ Container creation failed: {e}") + return None + + def _run_experiment_with_fixes(self, container, run_num: int, max_fixes: int) -> bool: + """Run experiment with automatic Aider fixes on failure""" + + for attempt in range(max_fixes + 1): # +1 for initial attempt + # Run experiment + print(f" 🧪 Attempt {attempt + 1}") + print(f" 📋 Running: python experiment.py --out_dir=run_{run_num}") + + result = container.exec_run( + f"python experiment.py --out_dir=run_{run_num}", + workdir="/workspace" + ) + + output = result.output.decode('utf-8') if result.output else "" + print(f" 📊 Exit code: {result.exit_code}") + print(f" 📝 Output:") + print("=" * 50) + print(output) + print("=" * 50) + + # Check for success + if result.exit_code == 0 and not self._has_errors(output): + # Additional check: verify any JSON results file was created + run_dir = os.path.join(self.output_dir, f"run_{run_num}") + json_files = [] + if os.path.exists(run_dir): + json_files = [f for f in os.listdir(run_dir) if f.endswith('.json')] + + if json_files: + print(f" ✅ Success on attempt {attempt + 1} - found JSON files: {json_files}") + return True + else: + print(f" ⚠️ Exit code 0 but no JSON files found in {run_dir}, treating as failure") + else: + print(f" ❌ Failed on attempt {attempt + 1}") + + # Failed - try Aider fix if we have API keys + if attempt < max_fixes and os.getenv("OPENAI_API_KEY"): + print(f" 🔧 Fixing with Aider (attempt {attempt + 1}/{max_fixes})") + + fix_prompt = f""" +The experiment failed with this error: +{output} + +Please fix the experiment.py file to resolve this error. +Make sure the code handles authentication, imports, and data processing correctly. 
+""" + + fix_result = container.exec_run([ + "aider", "--yes", "--model", "gpt-4o-mini", + "--message", fix_prompt, "experiment.py" + ], workdir="/workspace") + + if fix_result.exit_code != 0: + print(f" ❌ Aider fix failed") + else: + print(f" ✅ Aider fix completed") + elif attempt < max_fixes: + print(f" ⚠️ No OPENAI_API_KEY, skipping Aider fix") + else: + print(f" 💀 Max fix attempts reached") + + return False + + def _has_errors(self, output: str) -> bool: + """Check if output contains error indicators""" + error_patterns = [ + 'traceback', 'error:', 'exception:', 'failed', + 'importerror', 'modulenotfounderror', 'syntaxerror', + '401 client error', 'unauthorized', 'invalid credentials', + 'http error', 'connection error', 'authentication', + 'hfhubhttperror', 'oserror', 'valueerror' + ] + + # Check for specific error patterns + has_error = any(pattern in output.lower() for pattern in error_patterns) + + if has_error: + return True + + # Only consider it an error if we see explicit failure patterns + # Don't require specific success indicators - let file existence check handle success + return False + + def _extract_code(self, response: str) -> str: + """Extract Python code from LLM response""" + # Look for code blocks + if "```python" in response: + start = response.find("```python") + 9 + end = response.find("```", start) + if end != -1: + return response[start:end].strip() + + if "```" in response: + start = response.find("```") + 3 + end = response.find("```", start) + if end != -1: + return response[start:end].strip() + + # If no code blocks, return the whole response + return response.strip() + + def _get_model_readme(self, model_name: str) -> str: + """Fetch model README from HuggingFace Hub API""" + try: + # Get HF token if available + hf_token = os.getenv('HF_TOKEN') + headers = {} + if hf_token: + headers['Authorization'] = f'Bearer {hf_token}' + + # Try to get README from HF Hub API + url = f"https://huggingface.co/{model_name}/resolve/main/README.md" + response = requests.get(url, headers=headers, timeout=10) + + if response.status_code == 200: + readme_content = response.text + # Limit README size to avoid token overflow + if len(readme_content) > 3000: + readme_content = readme_content[:3000] + "\n... (truncated)" + return readme_content + else: + return f"README not available (HTTP {response.status_code})" + + except Exception as e: + return f"Failed to fetch README: {str(e)}" \ No newline at end of file diff --git a/tiny_scientist/fewshot_sample/__init__.py b/tiny_scientist/fewshot_sample/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tiny_scientist/tool.py b/tiny_scientist/tool.py index 4e2003e..96fff89 100644 --- a/tiny_scientist/tool.py +++ b/tiny_scientist/tool.py @@ -744,7 +744,7 @@ def get_or_build_base_image(self) -> Optional[str]: print(f"[Docker] Building image: {self.docker_image}") dockerfile = f""" FROM {self.docker_base} -RUN pip install --no-cache-dir numpy pandas scikit-learn matplotlib seaborn torch tensorflow transformers datasets evaluate wandb tqdm requests pillow +RUN pip install --no-cache-dir numpy pandas scikit-learn matplotlib seaborn torch tensorflow transformers datasets evaluate wandb tqdm requests pillow tf_keras keras sentencepiece """ with tempfile.TemporaryDirectory() as tmpdir: with open(os.path.join(tmpdir, "Dockerfile"), "w") as f: @@ -785,25 +785,53 @@ def get_or_build_experiment_image(self, experiment_py_path: str) -> Optional[str COPY experiment.py . 
""" with open(os.path.join(tmpdir, "Dockerfile"), "w") as f: - f.write(dockerfile) + f.write(dockerfile) # Copy experiment.py shutil.copy( experiment_py_path, os.path.join(tmpdir, "experiment.py") ) try: - # Build with detailed logging - build_logs = self.docker_client.images.build( - path=tmpdir, tag=image_name, rm=True, decode=True + # Build with detailed logging. + # The build() method can return a tuple: (image_object, logs_generator) + build_result = self.docker_client.images.build( + path=tmpdir, tag=image_name, rm=True, decode=False ) - # Check for build errors - for log in build_logs: - if "error" in log: - print(f"[Docker] Build error: {log['error']}") - raise Exception( - f"Docker build failed: {log['error']}" - ) - elif "stream" in log: - print(f"[Docker] {log['stream'].strip()}") + # docker-py 不同版本: + # a) 返回 (image_obj, logs_generator) + # b) 仅返回 logs_generator + if isinstance(build_result, tuple): + image, logs_generator = build_result + else: + image, logs_generator = None, build_result + + for entry in logs_generator: + # entry 现在应该是 bytes,需要手动解码 + if isinstance(entry, (bytes, bytearray)): + text = entry.decode("utf-8", errors="ignore") + for line in text.splitlines(): + if line.strip(): + try: + log = json.loads(line) + if "error" in log: + raise docker.errors.BuildError(log["error"], log) + if "stream" in log: + print(f"[Docker] {log['stream'].strip()}") + except json.JSONDecodeError: + print(f"[Docker] {line.strip()}") + else: + # 如果不是 bytes,直接转成字符串处理 + line = str(entry).strip() + if line: + print(f"[Docker] {line}") + + print(f"[Docker] Build successful for image {image_name}") + + except docker.errors.BuildError as e: + print(f"[Docker] Docker build failed explicitly: {e}") + print(f"[Docker] Build logs: {e.build_log}") + # Fallback to base image + print(f"[Docker] Falling back to base image: {base_image}") + return base_image except Exception as e: print(f"[Docker] Failed to build image {image_name}: {e}") # Fallback to base image From 5ff7e5581dd2a11cf505b6ac1ab2bc1ba6648539 Mon Sep 17 00:00:00 2001 From: robotMonkeyButler Date: Fri, 25 Jul 2025 19:22:45 +0000 Subject: [PATCH 2/4] update --- examples/hf_auto_eval_parallel.py | 159 ++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 examples/hf_auto_eval_parallel.py diff --git a/examples/hf_auto_eval_parallel.py b/examples/hf_auto_eval_parallel.py new file mode 100644 index 0000000..0145b75 --- /dev/null +++ b/examples/hf_auto_eval_parallel.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Parallel batch evaluation of model/dataset/metric triples + +Usage: + python hf_auto_eval_parallel.py --workers 4 --limit 20 --llm-model gpt-4o +""" + +import argparse +import json +import os +import sys +from pathlib import Path +from multiprocessing import Pool, Manager +import time + +# Allow importing tiny_scientist +sys.path.insert(0, str(Path(__file__).parent.parent)) +from tiny_scientist.coder_docker import DockerCoder + + +def load_combinations(json_path: Path): + if not json_path.exists(): + raise FileNotFoundError(f"Config file not found: {json_path}") + with open(json_path, "r") as f: + data = json.load(f) + return data.get("results", []) + + +def iter_triples(combos): + """ + Yield (model_id, dataset_id, metric_name) for every simple metric. + Skips metrics whose value is a dict (nested/complex). 
+ """ + for combo in combos: + model = combo.get("model_id") + dataset = combo.get("dataset_id") + metrics = combo.get("metrics", {}) or {} + for metric_name, value in metrics.items(): + if value is None or isinstance(value, dict): + continue + yield model, dataset, metric_name + + +def evaluate_single_triple(args_tuple): + """ + Worker function to evaluate a single (model, dataset, metric) triple + """ + model, dataset, metric, llm_model, runs, max_fixes, worker_id = args_tuple + + print(f"🔄 [Worker {worker_id}] Starting: {model} | {dataset} | {metric}") + + safe_dir = f"{model}_{dataset}_{metric}".replace("/", "_").replace("-", "_") + out_dir = f"simple_results/{safe_dir}" + + try: + # Create DockerCoder instance for this evaluation + coder = DockerCoder(model=llm_model, output_dir=out_dir) + success, message = coder.evaluate_model( + model_name=model, + dataset_name=dataset, + metric=metric, + max_runs=runs, + max_fixes=max_fixes, + ) + + status = "✅" if success else "❌" + print(f"{status} [Worker {worker_id}] Completed: {model} | {dataset} | {metric}") + + return { + "model": model, + "dataset": dataset, + "metric": metric, + "success": success, + "message": message, + "output_dir": out_dir if success else None, + "worker_id": worker_id + } + + except Exception as e: + print(f"❌ [Worker {worker_id}] Exception: {model} | {dataset} | {metric} - {e}") + return { + "model": model, + "dataset": dataset, + "metric": metric, + "success": False, + "message": f"Exception: {e}", + "output_dir": None, + "worker_id": worker_id + } + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--json-file", default=str(Path(__file__).parent / "perfect_model_dataset_metrics.json")) + parser.add_argument("--llm-model", default="gpt-4o") + parser.add_argument("--runs", type=int, default=1) + parser.add_argument("--max-fixes", type=int, default=3) + parser.add_argument("--limit", type=int, default=1000, help="Evaluate only first N triples (0 = all)") + parser.add_argument("--workers", type=int, default=2, help="Number of parallel workers") + args = parser.parse_args() + + combos = load_combinations(Path(args.json_file)) + triples = list(iter_triples(combos)) + if args.limit and args.limit > 0: + triples = triples[: args.limit] + + if not triples: + print("No evaluable triples found.") + return + + os.makedirs("simple_results", exist_ok=True) + + print(f"🚀 Starting parallel evaluation of {len(triples)} triples using {args.workers} workers...\n") + + # Prepare arguments for worker processes + worker_args = [] + for i, (model, dataset, metric) in enumerate(triples): + worker_id = i % args.workers + 1 + worker_args.append((model, dataset, metric, args.llm_model, args.runs, args.max_fixes, worker_id)) + + start_time = time.time() + + # Run evaluations in parallel + with Pool(processes=args.workers) as pool: + results = pool.map(evaluate_single_triple, worker_args) + + end_time = time.time() + duration = end_time - start_time + + # Process results + success_count = sum(1 for r in results if r["success"]) + + # Save summary + summary_path = "simple_results/batch_summary_parallel.json" + with open(summary_path, "w") as f: + json.dump( + { + "total": len(triples), + "success": success_count, + "failed": len(triples) - success_count, + "success_rate": success_count / len(triples), + "workers": args.workers, + "duration_seconds": duration, + "results": results, + }, + f, + indent=2, + ) + + print(f"\n🎉 Parallel evaluation completed!") + print(f"⏱️ Duration: {duration:.1f} seconds") + print(f"📊 
Success: {success_count} / {len(triples)} ({success_count / len(triples) * 100:.1f}%)") + print(f"🔄 Workers: {args.workers}") + print(f"📁 Summary: {summary_path}") + + +if __name__ == "__main__": + main() \ No newline at end of file From b299f819fdfac0fccd59c8e84d26b225e66d185a Mon Sep 17 00:00:00 2001 From: robotMonkeyButler Date: Fri, 1 Aug 2025 18:55:00 +0000 Subject: [PATCH 3/4] update the docer details --- examples/hf_auto_eval.py | 4 +- tiny_scientist/coder_docker.py | 107 ++++++++++++++++++++------------- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/examples/hf_auto_eval.py b/examples/hf_auto_eval.py index ae165c5..9bc32b5 100644 --- a/examples/hf_auto_eval.py +++ b/examples/hf_auto_eval.py @@ -45,8 +45,8 @@ def main(): parser.add_argument("--json-file", default=str(Path(__file__).parent / "perfect_model_dataset_metrics.json")) parser.add_argument("--llm-model", default="gpt-4o") parser.add_argument("--runs", type=int, default=1) - parser.add_argument("--max-fixes", type=int, default=5) - parser.add_argument("--limit", type=int, default=10, help="Evaluate only first N triples (0 = all)") + parser.add_argument("--max-fixes", type=int, default=15) + parser.add_argument("--limit", type=int, default=30, help="Evaluate only first N triples (0 = all)") args = parser.parse_args() combos = load_combinations(Path(args.json_file)) diff --git a/tiny_scientist/coder_docker.py b/tiny_scientist/coder_docker.py index 46edffa..d500137 100644 --- a/tiny_scientist/coder_docker.py +++ b/tiny_scientist/coder_docker.py @@ -114,51 +114,54 @@ def _generate_experiment_code(self, model_name: str, dataset_name: str, metric: {model_readme} ``` -CRITICAL REQUIREMENTS: -1. Import os and use: hf_token = os.getenv('HF_TOKEN') -2. Pass token parameter to from_pretrained(): token=hf_token (NOT use_auth_token) -3. Use device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -4. Move model to device: model.to(device) -5. For dataset processing, iterate through individual examples, NOT batches -6. Process each example individually to avoid tensor conversion issues -7. Use simple list operations for predictions and labels -8. Handle errors with try/except and print traceback -9. Use argparse for --out_dir parameter -10. Process max 100 samples for speed (not 1000) -11. Print progress every 50 samples -12. **CRITICAL**: Save ONLY metric results to JSON - NO predictions, NO true_labels arrays -13. **CRITICAL**: Use sklearn.metrics functions to calculate final metric value -14. 
**CRITICAL**: Follow the model README guidelines for proper usage, including: - - Correct input preprocessing - - Proper label mapping (if specified in README) - - Recommended inference settings - - Any model-specific requirements +GUIDELINES (be flexible and adapt as needed): -FEW-SHOT EXAMPLE - Results saving pattern (FOLLOW THIS EXACTLY): -```python -# Calculate the final metric value only -if "{metric}".lower() == "accuracy": - from sklearn.metrics import accuracy_score - metric_value = accuracy_score(true_labels, predictions) -elif "{metric}".lower() == "f1": - from sklearn.metrics import f1_score - metric_value = f1_score(true_labels, predictions, average='weighted') -else: - from sklearn.metrics import accuracy_score - metric_value = accuracy_score(true_labels, predictions) # default +**Authentication & Environment:** +- Use HF_TOKEN from environment: hf_token = os.getenv('HF_TOKEN') +- Pass token to model loading: token=hf_token (if needed) +- DO NOT add --token command line argument - token comes from environment only +- Detect CUDA availability and use appropriate device -# Save ONLY the calculated metric - DO NOT save predictions or true_labels -results = {{ - "{metric}": metric_value, - "total_samples": len(true_labels) -}} +**Model & Dataset Loading:** +- Load the model and tokenizer appropriately for the task type +- Handle different dataset structures intelligently +- Adapt to the specific dataset format (text classification, QA, etc.) -os.makedirs(out_dir, exist_ok=True) -with open(os.path.join(out_dir, "evaluation_results.json"), "w") as f: - json.dump(results, f, indent=2) -``` +**Data Processing Strategy:** +- Examine the dataset structure and adapt accordingly +- For text tasks: intelligently combine relevant fields (question+passage, premise+hypothesis, etc.) +- For classification: map labels correctly based on model outputs +- Process examples efficiently - you can batch if it's more appropriate +- Limit to around 100 samples for speed, but adjust if needed + +**Inference & Evaluation:** +- Use the model in the most appropriate way for its task type +- Handle different output formats (logits, probabilities, text generation) +- Calculate the requested metric correctly using sklearn or appropriate libraries +- Be robust to different label formats (bool, int, string) + +**Output Requirements:** +- Use argparse for --out_dir parameter ONLY (no other parameters needed) +- Get HF_TOKEN from environment variable os.getenv('HF_TOKEN'), not from command line +- Save results as JSON with the metric name and value +- Include basic metadata (total_samples, etc.) +- Print progress and final results + +**Error Handling:** +- Be reasonably robust but don't over-engineer +- Let the script fail fast on major issues rather than silently skipping everything +- Print helpful error messages + +**Flexibility Guidelines:** +- The model README contains the authoritative usage information - follow it +- Adapt the code structure to what makes sense for this specific model+dataset combination +- Don't force a rigid pattern if the model/dataset has special requirements +- Use your judgment on tokenization, preprocessing, and inference approach -Generate the complete script with proper imports, main function, and all necessary code: +Generate a complete, working Python script that intelligently handles this specific model and dataset combination: + +IMPORTANT: The script will be called as: python experiment.py --out_dir=DIRECTORY +Do not add any other command line parameters! 
""" try: print("🤖 Generating experiment code with LLM...") @@ -215,6 +218,9 @@ def _create_container(self) -> Optional[docker.models.containers.Container]: container.exec_run("git init", workdir="/workspace") container.exec_run("git config user.email 'test@example.com'", workdir="/workspace") container.exec_run("git config user.name 'Test User'", workdir="/workspace") + # Configure git to work in limited environment + container.exec_run("git config diff.external false", workdir="/workspace") + container.exec_run("git config core.filemode false", workdir="/workspace") container.exec_run("git add .", workdir="/workspace") container.exec_run("git commit -m 'Initial commit' --allow-empty", workdir="/workspace") @@ -273,13 +279,30 @@ def _run_experiment_with_fixes(self, container, run_num: int, max_fixes: int) -> Make sure the code handles authentication, imports, and data processing correctly. """ + print(f" 🤖 Running Aider with prompt:") + print(f" {fix_prompt.strip()}") + print(f" {'='*60}") + fix_result = container.exec_run([ "aider", "--yes", "--model", "gpt-4o-mini", + "--no-git", "--message", fix_prompt, "experiment.py" ], workdir="/workspace") + # Export and display Aider's complete output + aider_output = fix_result.output.decode('utf-8') if fix_result.output else "" + print(f" 📋 Aider output (exit code: {fix_result.exit_code}):") + print(f" {'─'*60}") + if aider_output.strip(): + # Show each line with prefix for clarity + for line in aider_output.split('\n'): + print(f" │ {line}") + else: + print(f" │ (No output from Aider)") + print(f" {'─'*60}") + if fix_result.exit_code != 0: - print(f" ❌ Aider fix failed") + print(f" ❌ Aider fix failed (exit code: {fix_result.exit_code})") else: print(f" ✅ Aider fix completed") elif attempt < max_fixes: From a1ecab1bc81dbf850fb11097fe597ebdc5cd992a Mon Sep 17 00:00:00 2001 From: keyangds <107345948+keyangds@users.noreply.github.com> Date: Sat, 20 Sep 2025 13:43:16 -0500 Subject: [PATCH 4/4] update writer prompt --- tiny_scientist/prompts/writer_prompt.yaml | 211 ++++++++++++++++++---- tiny_scientist/utils/output_formatter.py | 98 ++++++++-- tiny_scientist/writer.py | 13 +- 3 files changed, 262 insertions(+), 60 deletions(-) diff --git a/tiny_scientist/prompts/writer_prompt.yaml b/tiny_scientist/prompts/writer_prompt.yaml index b3dae17..7b52540 100644 --- a/tiny_scientist/prompts/writer_prompt.yaml +++ b/tiny_scientist/prompts/writer_prompt.yaml @@ -33,16 +33,36 @@ section_tips: - How do we solve it (i.e. our contribution!) - How do we verify that we solved it (e.g. Experiments and results) - Please make sure the abstract reads smoothly and is well-motivated. This should be one continuous paragraph with no breaks between the lines. + Please make sure the abstract reads smoothly and is well-motivated. This should be one continuous paragraph with no breaks between the lines. Should not be too long, try to grab its important points. Each point should only have one or two sentences. Introduction: | - - Longer version of the Abstract, i.e. of the entire paper - - What are we trying to do and why is it relevant? - - Why is this hard? - - How do we solve it (i.e. our contribution!) - - How do we verify that we solved it (e.g. Experiments and results) - - New trend: specifically list your contributions as bullet points - - Extra space? Future work! + - Write 5 paragraphs for the instructions. Each paragraph should answer one question. + - paragraph1: What is the problem? 
* Start broad with background and context
+      * Narrow down to the specific gap or unsolved issue
+      * End with a sharp statement of the problem
+      * Consider ending the paragraph with a pointed question, e.g.: Can LLMs simulate social interactions?
+
+    - paragraph2: Why is it interesting and important?
+      * Connect to research community demand
+      * Tie to trends or consensus in the field
+      * Show broader impact or urgency
+
+    - paragraph3: Why is it hard? (E.g., why do naive approaches fail?)
+      * Explain the inherent difficulty of the problem
+      * Give examples of why naive approaches fail
+      * Clarify how these challenges block real progress
+
+    - paragraph4: Why hasn't it been solved before?
+      (Or, what's wrong with previously proposed solutions? How does mine differ?)
+      * Acknowledge prior work and attempts
+      * Point out their shortcomings or gaps
+      * Clearly state how your approach differs and improves on them
+
+    - paragraph5: What are the key components of my approach and results?
+      * Introduce your core approach in one sentence
+      * Break it into 2–3 main components
+      * Summarize the key results

   Related_Work: |
     - Academic siblings of our work, i.e. alternative attempts in literature at trying to solve the same problem.
     - Goal is to “Compare and contrast” - how does their approach differ in either assumptions or method? If their method is applicable to our Problem Setting I expect a comparison in the experimental section. If not, there needs to be a clear statement why a given method is not applicable.
     - Note: Just describing what another paper is doing is not enough. We need to compare and contrast.

   Method: |
-    - What we do. Why we do it. All described using the general Formalism introduced in the Problem Setting and building on top of the concepts / foundations introduced in Background.
-    - Note: Don't directly put any code in this section, but you can refer to the code in the Method section.
+    - The first paragraph of the Method should be the problem definition: provide mathematical notation and the basic definitions for the problem.
+    - Use math to define and describe the method clearly.
+    - Focus on describing the different parts of your CORE method.
+    - Do not mention concrete implementation details or the experimental setup here.
+    - Use math to describe the whole process of the algorithm you propose.
+    - If necessary, include pseudo-code or an algorithm description, but in most cases this is not needed; it depends on the topic and the research question you want to answer.
+    - Very important: explain the motivation behind every methodological design choice. No design comes from scratch; give a brief reason for each choice before describing the details of that part of the method.

   Experimental_Setup: |
     - How do we test that our stuff works? Introduces a specific instantiation of the Problem Setting and specific implementation details of our Method for this Problem Setting.

@@ -60,21 +85,16 @@
   Results: |
     - Shows the results of running Method on our problem described in Experimental Setup.
-    - Includes statements on hyperparameters and other potential issues of fairness.
-    - Only includes results that have actually been run and saved in the logs. Do not hallucinate results that don't exist.
-    - If results exist: compares to baselines and includes statistics and confidence intervals.
-    - If results exist: includes ablation studies to show that specific parts of the method are relevant.
-    - Discusses limitations of the method.
-    - Make sure to include all the results from the experiments, and include all relevant figures.
+    - Each finding should be stated as an emphasized claim and used as the paragraph title.
+    - First describe the trends, improvement ratios, and changes shown in the table; include all the basic information.
+    - Furthermore, analyze and provide reasons for why the method works or does not work.

   Discussion: |
     - Analyze what the experimental results reveal in the context of the original research question.
     - Reflect on whether your method outperformed the baseline — and why that might be the case.
     - Discuss any cases where your method underperformed or behaved unexpectedly.
-    - Identify the strengths and weaknesses of your approach based on the comparison to the baseline.
-    - Connect these insights to the broader literature or practical use cases (without repeating the Related Work section).
-    - Acknowledge limitations in the findings and suggest possible improvements.
-    - Offer realistic ideas for future work or applications of your method.
+    - IMPORTANT POINT: in this section, try to raise the points that reviewers could potentially attack. For example: Is the training result due to data leakage? Is the human evaluation solid? Does the improvement really come from the point you claim?
+    - Each subsection should be framed as a research question.

   Conclusion: |
     - Brief recap of the entire paper.
@@ -147,7 +167,30 @@ section_prompt:
     Some tips are provided below:
     {section_tips}

-    The following structured context must be incorporated:
+    An example for an introduction should look like:
+
+    LLMs have proved to be powerful copilots in scientific research~\citep{{AI4Science2023TheIO}}, demonstrating their great potential for accelerating scientific discovery.
+    Despite the promising finding, a more ambitious question remains: \textit{{Can we simulate the human research community with LLMs}}? Answering such a question has multiple benefits: (1) simulating the human research community helps understand the underlying process behind the discovery of existing research ideas; (2) it can further help democratize and accelerate the discovery process of new research ideas.
+
+    However, simulating the human research community is challenging, as it involves leveraging multiple LLM agents to interact with complex research data. While existing multi-agent LLM frameworks have been successfully applied to areas like social simulation~\citep{{zhou2023sotopia,Gao2023S3SS}} and game simulation~\citep{{hua2023war,xu2023language}}, they are not well-suited for simulating research communities due to the complexity of collaborative research activities like paper writing and review writing. Although recent efforts have explored research automation using LLMs, these frameworks are typically limited to specific research tasks, such as idea generation~\citep{{girotra2023ideas, baek2024researchagent}} or code experimentation~\citep{{huang2024mlagentbench}}, or focus on simulating single-agent workflows~\citep{{lu2024ai}}. These frameworks cannot simulate collaborative research activities where researchers with diverse backgrounds work together to brainstorm ideas, review papers, etc—processes that are fundamental to modern human research.
+
+    \paragraph{{Research community as graph}}
+    Our key observation is that the deeply interconnected research community can be naturally represented as graphs. Indeed, similar graph structures like citation networks~\citep{{newman2001structure}} and academic social networks~\citep{{Tang2008ArnetMinerEA}} have been extensively studied within data mining research, with proven values in applications such as citation prediction~\citep{{holm2020longitudinal}}, recommendation~\citep{{West2016ARS}}, and community detection~\citep{{Yang2012DefiningAE}}.
+ However, introducing LLMs to a graph-structured research community can extend these previous works from prediction and analysis with existing data to dynamic simulation and real-time forecasting. + + \paragraph{{Novel framework for research simulation}} + In this work, we propose \envname, a simulator of the human research community. To bridge the gap between existing multi-agent simulation frameworks and the complexity of research activities, we propose a graph-based framework, inspired by the message-passing mechanism in Graph Neural Networks (GNNs), for multi-agent simulation. + Concretely, as shown in Figure \ref{{fig:community-graph}}, we propose a new concept of \textit{{agent-data graph}} with 2 generic types of nodes: (1) \textit{{agent}} nodes, suitable for entities like agents; (2) \textit{{data}} nodes, suitable for entities such as papers, reviews, and blogs. + Agent-data graphs are unique from standard heterogeneous graphs; here, the key conceptual difference between agent and data nodes is that an agent node can be considered a function over data nodes. + To inference on agent-data graphs, we propose a \textit{{TextGNN}} framework where message-passing processes are defined based on text-form information processing with LLMs, thanks to their strong in-context learning~\citep{{wei2023larger}} and reasoning~\citep{{lee2024reasoning}} ability. + We apply the proposed agent-data graph and TextGNN to the research simulation. Here, a research community can be regarded as a special form of agent-data graph, called \textit{{community graph}}, with research agents and research papers as two types of nodes, and we consider three types of edges (review, author, and cite) in the graph. Different community activities, such as paper writing and review writing, can be modeled as special message-passing processes on the community graph. + + \paragraph{{Novel evaluation for research simulation}} + With \envname for research simulation, a further research question is to evaluate the quality of that. Prior works primarily use human evaluation with breakdown metrics such as novelty, excitement, feasibility, and expected effectiveness~\citep{{si2024can,hu2024nova}}. These approaches inevitably suffer from subjectiveness and high costs. In our work, since \envname functions as a simulator, our primary focus is on measuring how closely its outputs align with those of the real-world research community. Community graphs naturally provide a similarity-based evaluation method by masking a given paper node in the community graph and evaluating whether a simulator can reconstruct the masked nodes. This definition focuses on simulation similarity, making it scalable and objective. Based on such a node masking prediction task, we build a benchmark called \benchname with 1,000 paper writing tasks and 200 review writing tasks requiring multi-agent collaboration. 
+ + \paragraph{{Main discoveries}} Based on the evaluation results from \benchname, we highlight three key findings: (1) \envname effectively simulates collaborative research activities, achieving an average similarity score of 0.68 for paper writing and 0.49 for review writing, as measured by the state-of-the-art text embedding model; (2) \envname demonstrates robustness and effectiveness in research simulation, showing improvement when more agents are added and maintaining performance when including unrelated papers; (3) \envname inspires interdisciplinary research, generating innovative ideas that combine insights from NLP, criminology, and astronomy and does not exist in the real-world research. + + The following is the idea that you need to write introduction for: - Title: **{title}** - Research Problem: **{problem}** @@ -161,7 +204,7 @@ section_prompt: Be sure to use \cite or \citet where relevant, referring to the works provided in the file. Do not cite anything that is not already in `references.bib`. Do not add any new entries to this. - Use at least **4 paragraphs**, and aim for **250–300 words**. + Use at **5 paragraphs**, and aim for **250–300 words**. Keep the experimental results (figures and tables) only in the Results section, and make sure that any captions are filled in. In this pass, do not reference anything in later sections of the paper. @@ -171,25 +214,62 @@ section_prompt: ... Method: | - Please fill in the Method section of the writeup. The Method section should **clearly define the approach taken in this study**, ensuring that readers can understand and, if needed, replicate the implementation. This section must be based on **both the proposed experiment and the provided code**. + Please fill in the Method section of the writeup. The Method section should **clearly define the approach taken in this study**, ensuring that readers can understand and, if needed, replicate the implementation. The writing should prioritize formal rigor and depth, combining conceptual descriptions, algorithmic formulations, and mathematical definitions where possible. Avoid overly narrative or vague phrasing. This section must be based on **the proposed experiment**. Some tips are provided below: {section_tips} - - Research Problem: **{problem}** - - Importance: **{importance}** - - Difficulty: **{difficulty}** - - Novelty: **{novelty}** - - Experiment Plan: **{experiment}** + An example for method writing is as follow: - The implementation code was: + \paragraph{{Definition of artifact graphs}} + We further define the artifact (model-data) graph as a bipartite graph $\mathcal{{G}} = (\mathcal{{V}}, \mathcal{{E}})$, where the node set $\mathcal{{V}} = \mathcal{{V}}_m \cup \mathcal{{V}}d$ consists of two disjoint types of nodes: model nodes and dataset nodes. The edge set $\mathcal{{E}} = \mathcal{{E}}{{md}}$ represents evaluation relationships between models and datasets. Each dataset node $v \in \mathcal{{V}}_d$ is associated with attributes such as metadata and task descriptions, while each model node $u \in \mathcal{{V}}_m$ is associated with model specifications such as architecture, parameters, and configuration. - ```python - {code} - ``` + \paragraph{{Uniqueness of artifact graphs}} + Unlike general bipartite graphs, the edges in a artifact graph carry rich semantic information: each edge $(u, v) \in \mathcal{{E}}_{{md}}$ is annotated with evaluation results such as accuracy, F1, perplexity, or other task-specific metrics. 
This allows the graph to capture fine-grained performance relationships between models and datasets. Such a structure enables downstream applications like model routing, automatic benchmarking, and meta-analysis of model generalization across diverse tasks. + + \paragraph{{Edge collection}} + To construct the edges in our artifact graph, we extract ground-truth evaluation metrics directly from the HuggingFace model cards. Specifically, we parse the README files to identify model–dataset pairs together with their reported performance scores. After matching model and dataset names with their canonical entries on the HuggingFace platform, we obtain a clean set of evaluation edges. In total, this process yields 2,561 perfectly matched evaluation records, which serve as the edge attributes in our graph. + + \paragraph{{Node collection}} + To construct the node set, we collect information for both models and datasets. For each model, we crawl its README and metadata fields (e.g., architecture, parameters, tags), and for each dataset, we extract its description and metadata (e.g., domain, size, license). We retain only nodes that participate in at least one evaluation edge, filtering out isolated artifacts. The resulting graph contains 2,144 nodes in total, consisting of 1,757 models and 387 datasets. On average, each model connects to 1.46 datasets, while each dataset connects to 6.62 models, indicating that the graph is relatively sparse. + + \section{{Linking Scientific artifacts for automatic discovery}} + In this section, we first formalize the problem definition of automatic discovery using the artifact-graph formulation. We then introduce our scalable solution, which addresses this problem via a two-stage \textit{{prediction–verification}} framework. + + \subsection{{Definition of Automatic Discovery}} + Let $\mathcal{{G}} = (\mathcal{{V}}, \mathcal{{E}})$ denote the artifact graph, where $\mathcal{{V}} = \mathcal{{V}}_m \cup \mathcal{{V}}_d$ consists of model nodes $\mathcal{{V}}_m$ and dataset nodes $\mathcal{{V}}_d$, and $\mathcal{{E}}_{{md}} \subseteq \mathcal{{V}}_m \times \mathcal{{V}}_d$ represents evaluation edges. Each edge $(m,d) \in \mathcal{{E}}_{{md}}$ is annotated with a performance score $s(m,d)$ (e.g., accuracy, F1). We define the automatic discovery problem as finding a missing link $(m,d) \notin \mathcal{{E}}_{{md}}$ such that the predicted score $\hat{{s}}(m,d)$ exceeds the best observed score on dataset $d$: + + \begin{{equation}} + \hat{{s}}(m,d) > \max_{{(m', d) \in \mathcal{{E}}_{{md}}}} s(m',d). + \end{{equation}} + + In other words, the discovery target is to identify model–dataset pairs that are not yet connected in $\mathcal{{G}}$ but are expected to outperform all existing models on the same dataset. + + + \subsection{{Scalable Link Discovery Framework}} + To conduct automatic link discovery in a scalable manner, we adopt a two-stage framework: (1) prediction and (2) verification. Since verification through actual model evaluation is computationally expensive, the prediction stage serves as a crucial filter to eliminate unpromising candidates and focus resources on the most likely discoveries. + + \paragraph{{Prediction}} + In the prediction stage, the goal is not only to estimate $\hat{{s}}(m,d)$ for missing edges but also to rank all candidate links by their discovery potential. 
Concretely, we train a model that learns a scoring function + + \begin{{equation}} + \hat{{s}}: \mathcal{{V}}_m \times \mathcal{{V}}_d \to \mathbb{{R}}, + \end{{equation}} + + which assigns each unseen pair $(m,d)\notin\mathcal{{E}}_{{md}}$ a predicted performance. The model is optimized to rank these candidates relative to observed edges, so that links expected to outperform existing results on the same dataset are ranked highest. The output is an ordered list of missing edges prioritized by their likelihood of being high-value discoveries. + + \paragraph{{Verification}} + Once the ranked list is produced, we proceed to the verification stage. Given limited computational resources, we select the top-$k$ predictions for each dataset to maximize the chance of identifying true discoveries. Verification is performed by a coding agent designed for model–dataset auto-evaluation: it automatically downloads the target model and dataset from HuggingFace, loads them in the correct configuration, and executes an inference pipeline to obtain evaluation results. This design makes verification both feasible and reliable, leveraging the standardized structure of HuggingFace artifacts to enable large-scale, automated evaluation. + + \paragraph{{Overall algorithm}} Based on the two stages and the overall framework is predict-then-verify. + + following is the idea that you need to write method for: + - Title: **{title}** + - Intro: **{intro}** + Be sure to use \cite or \citet where relevant, referring to the works provided in the file. Do not cite anything that is not already in `references.bib`. Do not add any new entries to this. - Keep the experimental results (figures and tables) only in the Results section, and make sure that any captions are filled in. + Keep the experimental results (figures and tables) only in the Results section, and make sure that any captions are filled in. DO NOT SAY CONCRETE EXPERIMENTAL SETUP IN METHOD SECTION. JUST DESCRIBE THE ALGORITHM AND THE MAJOR COMPONENT OF YOUR METHOD. In this pass, do not reference anything in later sections of the paper. Begin with: @@ -198,21 +278,28 @@ section_prompt: Experimental_Setup: | Please fill in the Experimental Setup section of the writeup. This section should **clearly document how the experiments were conducted** so that they can be replicated by other researchers. The description should be based on the provided **implementation, dataset, and experimental configuration**. - Some tips are provided below: {section_tips} + An example for experimental setting is: + + \paragraph{{Model settings}} We select Qwen2.5-7b-Instruct as our base LLM for the training of both the policy model and reward model. We select GPT-4o for LLM-as-the-judge in \sotopiaeval. + + \paragraph{{Evaluation settings}} + We evaluate our method on two configurations of the \sotopia benchmark: (1) \sotopia-hard, and (2) \sotopia-all. \sotopia-hard is a subset of \sotopia-all, consisting of 14 challenging social scenarios identified as difficult among all scenarios, and we use 10 distinct agent pairings per scenario. For \sotopia-all, we evaluate on the full coverage of 90 social scenarios, using 2 agent combos per scenario to ensure diversity while maintaining scalability. More statistical information is in Appendix~\S\ref{{artifact-details}}. 
+ - Research Problem: **{problem}** - Importance: **{importance}** - Difficulty: **{difficulty}** - Novelty: **{novelty}** + - Experiment Plan: **{experiment}** The experiments were run using the following method: **{experiment}**. The dataset, optimizer, and model settings are derived from the provided implementation. - ```python + python {code} - ``` + Be sure to use \cite or \citet where relevant, referring to the works provided in the file. Do not cite anything that is not already in `references.bib`. Do not add any new entries to this. @@ -234,6 +321,24 @@ section_prompt: Discussion: | Please fill in the Discussion section of the writeup. Follow the instructions carefully. + An example of a discussion section is as follows: + + To assess the effectiveness of \modelname, we first ensure that its performance gains are genuine and not the result of reward hacking (RQ1). We then analyze how our improvements come from the design of the reward attribution (RQ2) and the reward combination (RQ3). + + \subsection*{{RQ1: Does our improvement come from reward hacking or shortcut learning?}} \textit{{No, \sotopia-RL learns high-quality social skills instead of overfitting on partner models or evaluator models.}} + + Reward hacking occurs when performance improvements are confined to a specific partner model, a particular evaluator, or fail to generalize to human interactions. In Figure~\ref{{fig:goal-curve-b}} and Figure~\ref{{fig:goal-curve-c}}, we conduct a thorough analysis and show that the performance gains of \modelname are consistent across settings. Specifically, the improvements hold when switching between five different partner models and five different evaluator models, demonstrating strong robustness. Moreover, Table~\ref{{tab:human-eval}} confirms that these gains extend to human evaluation, further validating that the improvements are not evaluator-specific artifacts. Finally, Appendix~\S\ref{{safety-evaluation}} and \S\ref{{diversity-evaluation}} provide additional evidence from our safety and diversity evaluations. These results show that our trained policy model does not exhibit shortcut degeneration and remains safe and diverse. + + + \subsection*{{RQ2: Why does utterance-level reward attribution bring improvement?}} \textit{{The key to effective reward design lies in offline attribution, rather than in using a strong LLM.}} + + Unlike standard MDP tasks, social interactions cannot be accurately evaluated based only on the preceding dialogue context—the quality of an utterance often depends on how the entire conversation unfolds. To address this, we attribute episode-level rewards to each utterance using information from the whole dialogue, making the reward attribution inherently \textit{{offline}}. We point out that offline attribution is the key to our improvement. Table~\ref{{tab:utterance-reward-comparison}} compares two settings for training utterance-level reward models: (1) online reward labels attributed using only the preceding dialogue history, and (2) offline reward labels attributed using the full episode. The offline approach achieves a substantially higher goal score (7.81) than the online approach, clearly demonstrating its effectiveness. Importantly, such an improvement does not rely on GPT-4o itself. As shown in Figure~\ref{{fig:reward-corr}}, replacing GPT-4o with weaker models for utterance-level reward labeling produces highly correlated reward signals ($>$0.7).
This indicates that with well-designed prompts, precise offline credit assignment for utterance-level rewards can be reliably achieved even without state-of-the-art LLMs. More in-depth analysis and human evaluation results on utterance-level rewards are available in Appendix~\S\ref{{analysis-of-utterance-level-rewards}}. + + \subsection*{{RQ3: Why does the reward combination bring improvement?}} \textit{{Using rewards with multiple dimensions makes RM training more robust, and a better RM helps prevent RL from overfitting.}} + + To understand why the reward combination brings improvement, we first rule out the possibility that the observed gains are merely due to reward label smoothing. To test this, we increased the attribution granularity from a 3-point to a 10-point scale and reran the pipeline. The 10-point scale did not outperform the 3-point scale on \goalcompletion\ (6.44 vs.\ 6.74), indicating that the benefits cannot be explained by finer reward scaling alone. Next, we examine whether the improvement comes from capturing complementary aspects of social interactions. As shown in Table~\ref{{tab:bc-partner-hard}}, models trained on knowledge, relationship, and goal rewards exhibit positive but only moderate correlations. This suggests that each objective captures a distinct facet, and combining them allows the model to leverage a broader range of social signals. Finally, Figure~\ref{{fig:goal-curve-a}} shows that training with combined rewards stabilizes RL and regularizes the single-dimension objective in later stages. Such regularization contributes to the consistent improvement we observe. + + Some tips for writing a strong Discussion section: {section_tips} @@ -253,6 +358,13 @@ section_prompt: Results: | Please fill in the Results section of the research paper. Follow the instructions carefully. + A good Results section should look like this: + + \paragraph{{sotopia-RL helps build state-of-the-art social agents on the {{sotopia}} benchmark}} In Table~\ref{{tab:sota-performance}}, Qwen-2.5-7B-Instruct trained with {{sotopia-RL}} reaches the highest goal completion score, achieving 7.17 on {{sotopia-hard}}. This indicates that our utterance-level RM provides better guidance during RL training. It also suggests that for multi-turn social interactions, improving the quality of single-turn interactions with suitable single-turn rewards can effectively optimize multi-turn performance. Notably, AMPO~\citep{{wang2025think}} reaches 7.50 on {{sotopia-hard}}, but it includes an explicit reasoning process and requires more than 640 inference tokens per utterance on average. A direct comparison is therefore not entirely fair, since our method only uses GRPO to generate utterances without extra reasoning tokens. Full results are available in Appendix~\S\ref{{additional-experimental-results}}. + + \paragraph{{sotopia-RL goes beyond distillation from GPT}} Our training pipeline begins with GPT-based self-play episodes and GPT-based offline reward annotations. Importantly, GPT annotations are applied \textit{{offline}}, conditioning on the entire episode, whereas during RL training, rewards are computed \textit{{online}}, conditioned only on the preceding dialogue history. As shown in Table~\ref{{tab:sota-performance}}, {{ourmodel}} not only matches but surpasses GPT-4o when used directly as a policy model (7.17 vs.\ 6.97). If {{ourmodel}} were merely a stronger form of distillation, as in behavior cloning, it could at best equal GPT-4o’s performance, not exceed it.
+ + Some tips for writing a strong Results section: {section_tips} @@ -293,9 +405,9 @@ citation_related_work_prompt: | Please return only a JSON array (strictly valid) of new paper titles. These must be actual paper titles that are published and relevant to the topic. Example: - ```json + json ["Title 1", "Title 2", "Title 3"] - ``` + add_citation_prompt: | Given current version of the paper @@ -322,9 +434,9 @@ add_citation_prompt: | All titles must be real and verifiable. Please return only a JSON array (strictly valid) of new paper titles. These must be actual paper titles that are published and relevant to the topic. Example: - ```json + json ["Title 1", "Title 2", "Title 3"] - ``` + embed_citation_prompt: | You are assisting with embedding citation placeholders into an academic LaTeX section. @@ -352,6 +464,25 @@ embed_citation_prompt: | related_work_prompt: | Please write the Related Work section of the research paper. Follow the instructions and structure strictly. + An example for related work is: + + \paragraph{{Graph Architecture Search}} + Architecture search techniques have been applied to GNNs \cite{{gao2019graphnas,zhou2019auto}}. + However, these works only focus on the design within each GNN layer instead of a general GNN design space, + and only evaluate the designs on a small number of node classification tasks. + + \paragraph{{Evaluation of GNN Models}} + Multiple works discuss approaches for making fair comparison between GNN models \cite{{dwivedi2020benchmarking,errica2019fair,shchur2018pitfalls}}. + However, these models only consider some specific GNN designs (\eg, GCN, GAT, GraphSAGE), while our approach extensively explores the general design space of GNNs. + + \paragraph{{Other graph learning models}} We focus on message passing GNNs due to their proven performance and efficient implementation over various GNN tasks. There are alternative designs of graph learning models \cite{{maron2019provably,morris2019weisfeiler,murphy2019relational,you2019position}}, but their design spaces are different from GNNs and are less modularized. + + \paragraph{{Transferable Architecture Search}} + The idea of transferring architecture search results across tasks has been studied in the context of computer vision tasks \cite{{you2020graph,zoph2018learning}}. + Meta-level architecture design has also been studied in \cite{{radosavovic2020designing,shaw2019meta,wong2018transfer,zamir2018taskonomy}}, with the assumption that different tasks follow the same distribution (\eg, variants of ImageNet dataset \cite{{deng2009imagenet}}). + These approaches often make an assumption that a single neural architecture may perform well on all tasks, which fits well for tasks with relatively low variety. + However, due to the great variety of graph learning tasks, such assumption no longer holds. 
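Since the citation prompts above ask the model to return only a strictly valid JSON array of paper titles, and this patch drops the json code fences from the examples, a caller still has to tolerate replies that come back fenced or wrapped in prose. A minimal defensive parser is sketched below; `parse_title_array` is a hypothetical helper shown only to illustrate the contract these prompts set up, not an existing tiny_scientist function.

```python
# Defensive parsing of the "strictly valid JSON array of paper titles" reply.
import json
import re
from typing import List


def parse_title_array(raw_reply: str) -> List[str]:
    """Extract a JSON array of strings from an LLM reply, tolerating code fences."""
    text = raw_reply.strip()
    # Strip a fenced block (e.g. a json fence) if the model added one anyway.
    fenced = re.search(r"`{3}(?:json)?\s*(.*?)`{3}", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    # Otherwise fall back to the first [...] span in the reply.
    if not text.startswith("["):
        match = re.search(r"\[.*\]", text, flags=re.DOTALL)
        if not match:
            return []
        text = match.group(0)
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return []
    return [item for item in data if isinstance(item, str)]


print(parse_title_array('["Title 1", "Title 2", "Title 3"]'))  # ['Title 1', 'Title 2', 'Title 3']
```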
+ Some tips for writing a strong Related Work section: {related_work_tips} diff --git a/tiny_scientist/utils/output_formatter.py b/tiny_scientist/utils/output_formatter.py index 41c62fb..f5a455a 100644 --- a/tiny_scientist/utils/output_formatter.py +++ b/tiny_scientist/utils/output_formatter.py @@ -6,6 +6,8 @@ import shutil import subprocess import sys +import io +import zipfile from typing import Any, Dict, Match import requests @@ -381,27 +383,89 @@ def download_acl_template(output_dir: str) -> str: dest_template_dir = osp.join(output_dir, "latex") os.makedirs(dest_template_dir, exist_ok=True) - # GitHub repository URL for ACL - acl_api_url = ( - "https://api.github.com/repos/acl-org/acl-style-files/contents/latex" - ) - response = requests.get(acl_api_url) - response.raise_for_status() + # Primary: use GitHub API to fetch files from the latex directory + try: + acl_api_url = ( + "https://api.github.com/repos/acl-org/acl-style-files/contents/latex" + ) + response = requests.get(acl_api_url, timeout=15) + response.raise_for_status() + + files_data = response.json() + for file_info in files_data: + if file_info.get("type") == "file": + file_url = file_info.get("download_url") + filename = file_info.get("name", "") + if not file_url or not filename: + continue - files_data = response.json() - for file_info in files_data: - if file_info["type"] == "file": - file_url = file_info["download_url"] - filename = file_info["name"] + print(f"Downloading {filename}...") + file_response = requests.get(file_url, timeout=30) + file_response.raise_for_status() - print(f"Downloading {filename}...") - file_response = requests.get(file_url) - file_response.raise_for_status() + with open(osp.join(dest_template_dir, filename), "wb") as f: + f.write(file_response.content) - with open(osp.join(dest_template_dir, filename), "wb") as f: - f.write(file_response.content) + return dest_template_dir + except requests.HTTPError as e: + print( + f"[Warning] ACL API path /contents/latex not available (HTTP): {e}. Falling back to repo archive." + ) + except Exception as e: + print( + f"[Warning] Failed to fetch ACL template via API: {e}. Falling back to repo archive." 
+ ) - return dest_template_dir + # Fallback: download the whole repository archive and extract the latex folder + last_error: Optional[Exception] = None + for branch in ["main", "master"]: + try: + zip_url = f"https://codeload.github.com/acl-org/acl-style-files/zip/refs/heads/{branch}" + print(f"Attempting ACL archive download: {zip_url}") + zr = requests.get(zip_url, timeout=30) + zr.raise_for_status() + + with zipfile.ZipFile(io.BytesIO(zr.content)) as zf: + extracted = 0 + # Prefer extracting files under any /latex/ directory + for name in zf.namelist(): + if "/latex/" in name and not name.endswith("/"): + rel = name.split("/latex/", 1)[1] + # Skip empty rel (directory entries) + if rel.strip(): + target_path = osp.join(dest_template_dir, rel) + os.makedirs(osp.dirname(target_path), exist_ok=True) + with zf.open(name) as src, open(target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + extracted += 1 + + if extracted == 0: + # Heuristic: copy a minimal set of known filenames if structure changed + wanted = { + "acl_latex.tex", + "acl_natbib.bst", + "acl.bst", + "acl.sty", + "anthology.bib", + } + for name in zf.namelist(): + base = name.rsplit("/", 1)[-1] + if base in wanted and not name.endswith("/"): + target_path = osp.join(dest_template_dir, base) + with zf.open(name) as src, open(target_path, "wb") as dst: + shutil.copyfileobj(src, dst) + extracted += 1 + + if extracted > 0: + print(f"Extracted {extracted} ACL template files from {branch} branch archive") + return dest_template_dir + except Exception as e: + last_error = e + print(f"[Warning] ACL archive fallback failed for branch {branch}: {e}") + + raise RuntimeError( + f"Failed to download ACL template via API and archive. Last error: {last_error}" + ) @staticmethod def download_iclr_template(output_dir: str) -> str: diff --git a/tiny_scientist/writer.py b/tiny_scientist/writer.py index 4312cc8..1ad7f2e 100644 --- a/tiny_scientist/writer.py +++ b/tiny_scientist/writer.py @@ -41,6 +41,7 @@ def __init__( self.template = template self.temperature = temperature self.searcher: BaseTool = PaperSearchTool(s2_api_key=s2_api_key) + # self.searcher = None self.drawer: BaseTool = DrawerTool(model, prompt_template_dir, temperature) self.formatter: BaseOutputFormatter self.config = Config(prompt_template_dir) @@ -113,7 +114,7 @@ def run( self._refine_paper() self._add_citations(idea) - self._generate_diagram_for_section() + # self._generate_diagram_for_section() paper_name = ( idea.get("Title", "Research Paper") @@ -246,7 +247,13 @@ def _write_section( section_tips=self.prompts.section_tips[section], experiment=experiment, ) - elif section in ["Method", "Experimental_Setup"]: + elif section in ["Method"]: + section_prompt = self.prompts.section_prompt[section].format( + section_tips=self.prompts.section_tips[section], + title=title, + intro=self.generated_sections["Introduction"], + ) + elif section in ["Experimental_Setup"]: section_prompt = self.prompts.section_prompt[section].format( section_tips=self.prompts.section_tips[section], problem=idea["Problem"], @@ -338,7 +345,7 @@ def _search_reference(self, paper_list: List[str]) -> Dict[str, Any]: results_dict = {} for paper_name in paper_list: - try: + try: result = self.searcher.run(paper_name) if result: