apache · JoshRosen · Jan 24, 2016 · Jan 24, 2016 · Jan 24, 2016 · Jan 24, 2016
diff --git a/NOTICE b/NOTICE
@@ -650,3 +650,19 @@ For CSV functionality:
  */
 
 
+===============================================================================
+For dev/sparktestsupport/toposort.py:
+
+Copyright 2014 True Blade Systems, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/dev/run-tests.py b/dev/run-tests.py
@@ -29,6 +29,7 @@
 
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
 from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+from sparktestsupport.toposort import toposort_flatten, toposort
 import sparktestsupport.modules as modules
 
 
@@ -43,7 +44,7 @@ def determine_modules_for_files(filenames):
     If a file is not associated with a more specific submodule, then this method will consider that
     file to belong to the 'root' module.
 
-    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
+    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
     ['pyspark-core', 'sql']
     >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
     ['root']
@@ -99,14 +100,16 @@ def determine_modules_to_test(changed_modules):
     Given a set of modules that have changed, compute the transitive closure of those modules'
     dependent modules in order to determine the set of modules that should be tested.
 
-    >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
+    Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+
+    >>> [x.name for x in determine_modules_to_test([modules.root])]
     ['root']
-    >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
-    ['examples', 'graphx']
-    >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
+    ['graphx', 'examples']
+    >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
     >>> x # doctest: +NORMALIZE_WHITESPACE
-    ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
-     'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
+    ['sql', 'hive', 'mllib', 'examples', 'hive-thriftserver', 'pyspark-sql', 'sparkr',
+     'pyspark-mllib', 'pyspark-ml']
     """
     # If we're going to have to run all of the tests, then we can just short-circuit
     # and return 'root'. No module depends on root, so if it appears then it will be
@@ -116,7 +119,9 @@ def determine_modules_to_test(changed_modules):
     modules_to_test = set()
     for module in changed_modules:
         modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
-    return modules_to_test.union(set(changed_modules))
+    modules_to_test = modules_to_test.union(set(changed_modules))
+    return toposort_flatten(
+        {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True)
 
 
 def determine_tags_to_exclude(changed_modules):
@@ -377,12 +382,12 @@ def run_scala_tests_maven(test_profiles):
 
 def run_scala_tests_sbt(test_modules, test_profiles):
 
-    sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
+    sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
 
     if not sbt_test_goals:
         return
 
-    profiles_and_goals = test_profiles + list(sbt_test_goals)
+    profiles_and_goals = test_profiles + sbt_test_goals
 
     print("[info] Running Spark tests using SBT with these arguments: ",
           " ".join(profiles_and_goals))

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -15,12 +15,14 @@
 # limitations under the License.
 #
 
+from functools import total_ordering
 import itertools
 import re
 
 all_modules = []
 
 
+@total_ordering
 class Module(object):
     """
     A module is the basic abstraction in our test runner script. Each module consists of a set of
@@ -75,20 +77,56 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
     def contains_file(self, filename):
         return any(re.match(p, filename) for p in self.source_file_prefixes)
 
+    def __repr__(self):
+        return "Module<%s>" % self.name
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not (self.name == other.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+catalyst = Module(
+    name="catalyst",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/catalyst/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+    ],
+)
+
 
 sql = Module(
     name="sql",
-    dependencies=[],
+    dependencies=[catalyst],
     source_file_regexes=[
-        "sql/(?!hive-thriftserver)",
+        "sql/core/",
+    ],
+    sbt_test_goals=[
+        "sql/test",
+    ],
+)
+
+hive = Module(
+    name="hive",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive/",
         "bin/spark-sql",
     ],
     build_profile_flags=[
         "-Phive",
     ],
     sbt_test_goals=[
-        "catalyst/test",
-        "sql/test",
         "hive/test",
     ],
     test_tags=[
@@ -99,7 +137,7 @@ def contains_file(self, filename):
 
 hive_thriftserver = Module(
     name="hive-thriftserver",
-    dependencies=[sql],
+    dependencies=[hive],
     source_file_regexes=[
         "sql/hive-thriftserver",
         "sbin/start-thriftserver.sh",
@@ -282,7 +320,7 @@ def contains_file(self, filename):
 
 examples = Module(
     name="examples",
-    dependencies=[graphx, mllib, streaming, sql],
+    dependencies=[graphx, mllib, streaming, hive],
     source_file_regexes=[
         "examples/",
     ],
@@ -314,7 +352,7 @@ def contains_file(self, filename):
 
 pyspark_sql = Module(
     name="pyspark-sql",
-    dependencies=[pyspark_core, sql],
+    dependencies=[pyspark_core, hive],
     source_file_regexes=[
         "python/pyspark/sql"
     ],
@@ -404,7 +442,7 @@ def contains_file(self, filename):
 
 sparkr = Module(
     name="sparkr",
-    dependencies=[sql, mllib],
+    dependencies=[hive, mllib],
     source_file_regexes=[
         "R/",
     ],

diff --git a/dev/sparktestsupport/toposort.py b/dev/sparktestsupport/toposort.py
@@ -0,0 +1,85 @@
+#######################################################################
+# Implements a topological sort algorithm.
+#
+# Copyright 2014 True Blade Systems, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Notes:
+#  Based on http://code.activestate.com/recipes/578272-topological-sort
+#   with these major changes:
+#    Added unittests.
+#    Deleted doctests (maybe not the best idea in the world, but it cleans
+#     up the docstring).
+#    Moved functools import to the top of the file.
+#    Changed assert to a ValueError.
+#    Changed iter[items|keys] to [items|keys], for python 3
+#     compatibility. I don't think it matters for python 2 these are
+#     now lists instead of iterables.
+#    Copy the input so as to leave it unmodified.
+#    Renamed function from toposort2 to toposort.
+#    Handle empty input.
+#    Switch tests to use set literals.
+#
+########################################################################
+
+from functools import reduce as _reduce
+
+
+__all__ = ['toposort', 'toposort_flatten']
+
+
+def toposort(data):
+    """Dependencies are expressed as a dictionary whose keys are items
+and whose values are a set of dependent items. Output is a list of
+sets in topological order. The first set consists of items with no
+dependences, each subsequent set consists of items that depend upon
+items in the preceeding sets.
+"""
+
+    # Special case empty input.
+    if len(data) == 0:
+        return
+
+    # Copy the input so as to leave it unmodified.
+    data = data.copy()
+
+    # Ignore self dependencies.
+    for k, v in data.items():
+        v.discard(k)
+    # Find all items that don't depend on anything.
+    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
+    # Add empty dependences where needed.
+    data.update({item: set() for item in extra_items_in_deps})
+    while True:
+        ordered = set(item for item, dep in data.items() if len(dep) == 0)
+        if not ordered:
+            break
+        yield ordered
+        data = {item: (dep - ordered)
+                for item, dep in data.items()
+                if item not in ordered}
+    if len(data) != 0:
+        raise ValueError('Cyclic dependencies exist among these items: {}'.format(
+            ', '.join(repr(x) for x in data.items())))
+
+
+def toposort_flatten(data, sort=True):
+    """Returns a single list of dependencies. For any set returned by
+toposort(), those items are sorted and appended to the result (just to
+make the results deterministic)."""
+
+    result = []
+    for d in toposort(data):
+        result.extend((sorted if sort else list)(d))
+    return result