diff --git a/README.md b/README.md index cf01b5d..b36e07b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ cd methods2test git lfs pull ``` -Please refer to this [web page](https://docs.microsoft.com/en-us/azure/devops/repos/git/manage-large-files?view=azure-devops) for more details about Gut LFS and working with large files. +Please refer to this [web page](https://docs.microsoft.com/en-us/azure/devops/repos/git/manage-large-files?view=azure-devops) for more details about Git LFS and working with large files. ## What is Unit Test Case? Unit testing is a level of software testing where individual software components are tested with a purpose of validating that each software component performs as designed. A unit is the smallest testable part of any software. In this work, we are focusing on testing Java methods. @@ -103,7 +103,7 @@ The corpus is organized in different levels of focal context, incorporating info - *FM_FC_MS_FF*: focal method + focal class name + constructor signatures + public method signatures + public fields ### Methods2Test v1.0 -The `methods2test-v1.0` folder contains the previous version of this dataset. More information are availble in the README within the folder. +The `methods2test-v1.0` folder contains the previous version of this dataset. More information is available in the README within the folder. # Statistics @@ -113,7 +113,7 @@ The dataset contains 780,944 test cases mapped to their corresponding focal meth - Repositories: 9,410 - Instances: 780,944 -We split the dataset in training (80%), validaiton (10%), and test (10%) sets. The split is performed avoiding data leakage at repository-level, that is, all instances from a given repository will appears in a single set (e.g., in training but not in test). Duplicate pairs with same code representation have been removed. +We split the dataset in training (80%), validation (10%), and test (10%) sets. 
The split is performed avoiding data leakage at repository-level, that is, all instances from a given repository will appear in a single set (e.g., in training but not in test). Duplicate pairs with the same code representation have been removed. **Training** - Repositories: 7,440 diff --git a/scripts/README.md b/scripts/README.md index d46ccba..c030974 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,7 +2,7 @@ The `script` folder contains the following files: - `find_map_test_cases.py`: the main script to find and map test cases in a repository - `TestParser.py`: an utility class that parses test cases - `java-grammar.so`: tree-sitter Java grammar file - +- `libtree-sitter-java.dylib`: tree-sitter Java grammar file for macOS ## Extraction & Mapping To extract test cases and map them to focal methods, we can run the `find_map_test_cases.py` script, which takes the following arguments: diff --git a/scripts/find_map_test_cases.py b/scripts/find_map_test_cases.py index 9f2af1f..633831c 100644 --- a/scripts/find_map_test_cases.py +++ b/scripts/find_map_test_cases.py @@ -9,6 +9,7 @@ import tqdm import copy from TestParser import TestParser +import platform @@ -74,8 +75,12 @@ def find_map_test_cases(root, grammar_file, language, output, repo): return 0, 0, 0, 0 #Java Files + os_system = platform.system() try: - result = subprocess.check_output(['find', '-name', '*.java']) + if os_system == "Darwin": + result = subprocess.check_output(['find', '.', '-iname', '*.java']) + else: + result = subprocess.check_output(['find', '-name', '*.java']) java = result.decode('ascii').splitlines() java = [j.replace("./", "") for j in java] except: @@ -100,6 +105,8 @@ def find_map_test_cases(root, grammar_file, language, output, repo): for test in tests: tests_norm = test.lower().replace("/src/test/", "/src/main/") tests_norm = tests_norm.replace("test", "") + if os_system == "Darwin": + tests_norm = tests_norm.replace("./", "") if tests_norm in focals_norm: index = 
focals_norm.index(tests_norm) @@ -343,6 +350,9 @@ def main(): repo_git = args['repo_url'] repo_id = args['repo_id'] grammar_file = args['grammar'] + if not os.path.isabs(grammar_file): + current_dir = os.getcwd() + grammar_file = os.path.join(current_dir, grammar_file) tmp = args['tmp'] output = args['output'] diff --git a/scripts/libtree-sitter-java.dylib b/scripts/libtree-sitter-java.dylib new file mode 100755 index 0000000..b92131d Binary files /dev/null and b/scripts/libtree-sitter-java.dylib differ