-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-8725][PROJECT-INFRA] Test modules in topologically-sorted order in dev/run-tests #10885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
adcc843
6a9cc2a
76a446e
c122c55
00a817a
a17c3d0
0bc9da0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,12 +15,14 @@ | |
| # limitations under the License. | ||
| # | ||
|
|
||
| from functools import total_ordering | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is only available in Python 2.7+, but that might be fine: AMPLab Jenkins runs a compatible Python version and I figure that people have control over their dev environments and can upgrade if necessary. |
||
| import itertools | ||
| import re | ||
|
|
||
| all_modules = [] | ||
|
|
||
|
|
||
| @total_ordering | ||
| class Module(object): | ||
| """ | ||
| A module is the basic abstraction in our test runner script. Each module consists of a set of | ||
|
|
@@ -75,20 +77,56 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= | |
| def contains_file(self, filename): | ||
| return any(re.match(p, filename) for p in self.source_file_prefixes) | ||
|
|
||
| def __repr__(self): | ||
| return "Module<%s>" % self.name | ||
|
|
||
| def __lt__(self, other): | ||
| return self.name < other.name | ||
|
|
||
| def __eq__(self, other): | ||
| return self.name == other.name | ||
|
|
||
| def __ne__(self, other): | ||
| return not (self.name == other.name) | ||
|
|
||
| def __hash__(self): | ||
| return hash(self.name) | ||
|
|
||
|
|
||
| catalyst = Module( | ||
| name="catalyst", | ||
| dependencies=[], | ||
| source_file_regexes=[ | ||
| "sql/catalyst/", | ||
| ], | ||
| sbt_test_goals=[ | ||
| "catalyst/test", | ||
| ], | ||
| ) | ||
|
|
||
|
|
||
| sql = Module( | ||
| name="sql", | ||
| dependencies=[], | ||
| dependencies=[catalyst], | ||
| source_file_regexes=[ | ||
| "sql/(?!hive-thriftserver)", | ||
| "sql/core/", | ||
| ], | ||
| sbt_test_goals=[ | ||
| "sql/test", | ||
| ], | ||
| ) | ||
|
|
||
| hive = Module( | ||
| name="hive", | ||
| dependencies=[sql], | ||
| source_file_regexes=[ | ||
| "sql/hive/", | ||
| "bin/spark-sql", | ||
| ], | ||
| build_profile_flags=[ | ||
| "-Phive", | ||
| ], | ||
| sbt_test_goals=[ | ||
| "catalyst/test", | ||
| "sql/test", | ||
| "hive/test", | ||
| ], | ||
| test_tags=[ | ||
|
|
@@ -99,7 +137,7 @@ def contains_file(self, filename): | |
|
|
||
| hive_thriftserver = Module( | ||
| name="hive-thriftserver", | ||
| dependencies=[sql], | ||
| dependencies=[hive], | ||
| source_file_regexes=[ | ||
| "sql/hive-thriftserver", | ||
| "sbin/start-thriftserver.sh", | ||
|
|
@@ -282,7 +320,7 @@ def contains_file(self, filename): | |
|
|
||
| examples = Module( | ||
| name="examples", | ||
| dependencies=[graphx, mllib, streaming, sql], | ||
| dependencies=[graphx, mllib, streaming, hive], | ||
| source_file_regexes=[ | ||
| "examples/", | ||
| ], | ||
|
|
@@ -314,7 +352,7 @@ def contains_file(self, filename): | |
|
|
||
| pyspark_sql = Module( | ||
| name="pyspark-sql", | ||
| dependencies=[pyspark_core, sql], | ||
| dependencies=[pyspark_core, hive], | ||
| source_file_regexes=[ | ||
| "python/pyspark/sql" | ||
| ], | ||
|
|
@@ -404,7 +442,7 @@ def contains_file(self, filename): | |
|
|
||
| sparkr = Module( | ||
| name="sparkr", | ||
| dependencies=[sql, mllib], | ||
| dependencies=[hive, mllib], | ||
| source_file_regexes=[ | ||
| "R/", | ||
| ], | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| ####################################################################### | ||
| # Implements a topological sort algorithm. | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops, I missed this. It looks like it's Apache licensed and you have correctly preserved the header. https://bitbucket.org/ericvsmith/toposort/src/25b5894c4229cb888f77cf0c077c05e2464446ac/LICENSE.txt?fileviewer=file-view-default You'll need to put the That should be all.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do I just append the entire contents of the notice verbatim? Is there a shorthand way of doing this (e.g. just including just the copyright)? I got a bit confused looking at the existing NOTICE file and since it hasn't been updated in a while I didn't spot any smaller patches / diffs to model my change on.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes just append. I don't know if we've got a strong format going, except to try to precede the text from Foo with a line like "Foo" or "For Foo:". Anything sane should be OK. Technically you reproduce all relevant parts of the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alright, updated the NOTICE to append it verbatim. |
||
| # | ||
| # Copyright 2014 True Blade Systems, Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
| # Notes: | ||
| # Based on http://code.activestate.com/recipes/578272-topological-sort | ||
| # with these major changes: | ||
| # Added unittests. | ||
| # Deleted doctests (maybe not the best idea in the world, but it cleans | ||
| # up the docstring). | ||
| # Moved functools import to the top of the file. | ||
| # Changed assert to a ValueError. | ||
| # Changed iter[items|keys] to [items|keys], for python 3 | ||
| # compatibility. I don't think it matters for python 2 these are | ||
| # now lists instead of iterables. | ||
| # Copy the input so as to leave it unmodified. | ||
| # Renamed function from toposort2 to toposort. | ||
| # Handle empty input. | ||
| # Switch tests to use set literals. | ||
| # | ||
| ######################################################################## | ||
|
|
||
| from functools import reduce as _reduce | ||
|
|
||
|
|
||
| __all__ = ['toposort', 'toposort_flatten'] | ||
|
|
||
|
|
||
| def toposort(data): | ||
| """Dependencies are expressed as a dictionary whose keys are items | ||
| and whose values are a set of dependent items. Output is a list of | ||
| sets in topological order. The first set consists of items with no | ||
| dependences, each subsequent set consists of items that depend upon | ||
| items in the preceeding sets. | ||
| """ | ||
|
|
||
| # Special case empty input. | ||
| if len(data) == 0: | ||
| return | ||
|
|
||
| # Copy the input so as to leave it unmodified. | ||
| data = data.copy() | ||
|
|
||
| # Ignore self dependencies. | ||
| for k, v in data.items(): | ||
| v.discard(k) | ||
| # Find all items that don't depend on anything. | ||
| extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys()) | ||
| # Add empty dependences where needed. | ||
| data.update({item: set() for item in extra_items_in_deps}) | ||
| while True: | ||
| ordered = set(item for item, dep in data.items() if len(dep) == 0) | ||
| if not ordered: | ||
| break | ||
| yield ordered | ||
| data = {item: (dep - ordered) | ||
| for item, dep in data.items() | ||
| if item not in ordered} | ||
| if len(data) != 0: | ||
| raise ValueError('Cyclic dependencies exist among these items: {}'.format( | ||
| ', '.join(repr(x) for x in data.items()))) | ||
|
|
||
|
|
||
| def toposort_flatten(data, sort=True): | ||
| """Returns a single list of dependencies. For any set returned by | ||
| toposort(), those items are sorted and appended to the result (just to | ||
| make the results deterministic).""" | ||
|
|
||
| result = [] | ||
| for d in toposort(data): | ||
| result.extend((sorted if sort else list)(d)) | ||
| return result | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAIK there aren't any duplicates here, so we might as well maintain the sorted module order.