From 684183b337b7bf097918ed9bab578a13ddcdf29e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 25 Oct 2019 15:35:57 +0200 Subject: [PATCH 01/34] monkeypatched extract_dates --- .gitignore | 1 + ipyparallel/client/client.py | 3 ++- ipyparallel/controller/hub.py | 10 ++++++++++ ipyparallel/controller/scheduler.py | 5 ++++- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index b0aecc908..77eb0e8f4 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ __pycache__ .coverage *coverage.xml .coverage.* +.idea diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 644c90f70..92e687608 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -55,7 +55,8 @@ from .asyncresult import AsyncResult, AsyncHubResult from .futures import MessageFuture, multi_future from .view import DirectView, LoadBalancedView - +import jupyter_client.session +jupyter_client.session.extract_dates = lambda obj: obj #-------------------------------------------------------------------------- # Decorators for Client methods #-------------------------------------------------------------------------- diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 931caceb9..df8269b06 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -14,6 +14,7 @@ import sys import time +from jupyter_client.jsonutil import parse_date from tornado.gen import coroutine, maybe_future import zmq from zmq.eventloop.zmqstream import ZMQStream @@ -28,6 +29,7 @@ DottedObjectName, observe ) +from datetime import datetime from ipyparallel import error, util from ipyparallel.factory import RegistrationFactory @@ -70,9 +72,15 @@ def empty_record(): 'stderr': '', } +def ensure_date_is_parsed(header): + if not isinstance(header['date'], datetime): + header['date'] = parse_date(header['date']) + def init_record(msg): """Initialize a TaskRecord based on a request.""" header = msg['header'] + + ensure_date_is_parsed(header) return { 'msg_id' : header['msg_id'], 'header' : header, @@ -686,6 +694,7 @@ def save_queue_result(self, idents, msg): # update record anyway, because the unregistration could have been premature rheader = msg['header'] md = msg['metadata'] + ensure_date_is_parsed(rheader) completed = util.ensure_timezone(rheader['date']) started = extract_dates(md.get('started', None)) result = { @@ -792,6 +801,7 @@ def save_task_result(self, idents, msg): self.completed[eid].append(msg_id) if msg_id in self.tasks[eid]: self.tasks[eid].remove(msg_id) + ensure_date_is_parsed(header) completed = util.ensure_timezone(header['date']) started = extract_dates(md.get('started', None)) result = { diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 2988bec8e..b7c94133c 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -36,6 +36,9 @@ from .dependency import Dependency +import jupyter_client.session +jupyter_client.session.extract_dates = lambda obj: obj + @decorator def logged(f,self,*args,**kwargs): # print ("#--------------------") @@ -615,7 +618,7 @@ def submit_task(self, job, indices=None): # Result Handling #----------------------------------------------------------------------- - + @util.log_errors def dispatch_result(self, raw_msg): """dispatch method for result replies""" From 1cde478f96b674e27b142b01e1f781be9c312a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Mon, 9 Dec 2019 17:16:51 
+0100 Subject: [PATCH 02/34] Test commit to trigger asv benchmarking for multiple commits. --- ipyparallel/client/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 92e687608..699ca65a2 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -85,6 +85,7 @@ def unpack_message(f, self, msg_parts): Please double-check your profile and ensure that a cluster is running. """ + class ExecuteReply(RichOutput): """wrapper for finished Execute results""" def __init__(self, msg_id, content, metadata): From 055b8d501248ce14ad016e955f2f712df40d9a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Mon, 13 Jan 2020 13:11:06 +0100 Subject: [PATCH 03/34] WIP --- ipyparallel/client/client.py | 18 ++++---- ipyparallel/client/view.py | 44 ++++++++++++++++++- ipyparallel/controller/broadcast_scheduler.py | 5 +++ 3 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 ipyparallel/controller/broadcast_scheduler.py diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 699ca65a2..546670463 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -7,7 +7,7 @@ try: from collections.abc import Iterable -except ImportError: # py2 +except ImportError: # py2 from collections import Iterable import socket from concurrent.futures import Future @@ -37,10 +37,7 @@ from IPython.paths import get_ipython_dir from IPython.utils.path import compress_user from ipython_genutils.py3compat import cast_bytes, string_types, xrange, iteritems -from traitlets import ( - HasTraits, Instance, Unicode, - Dict, List, Bool, Set, Any -) +from traitlets import HasTraits, Instance, Unicode, Dict, List, Bool, Set, Any from decorator import decorator from ipyparallel import Reference @@ -54,12 +51,12 @@ from ..util import ioloop from .asyncresult import AsyncResult, AsyncHubResult from .futures import MessageFuture, multi_future -from .view import DirectView, LoadBalancedView +from .view import DirectView, LoadBalancedView, BroadCastView import jupyter_client.session jupyter_client.session.extract_dates = lambda obj: obj -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Decorators for Client methods -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- @decorator @@ -75,9 +72,10 @@ def unpack_message(f, self, msg_parts): pprint(msg) return f(self, msg) -#-------------------------------------------------------------------------- + +# -------------------------------------------------------------------------- # Classes -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- _no_connection_file_msg = """ diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 2905bc182..8437bcd53 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -838,6 +838,48 @@ def activate(self, suffix=''): ip.magics_manager.register(M) +class BroadCastView(View): + def __init__(self, client=None, socket=None, targets=None): + super().__init__(client=client, socket=socket, targets=targets) + + + @sync_results + @save_ids + def _really_apply(self, f, args=None, kwargs=None, targets=None, block=None, track=None): + args = [] if args is None else args + 
kwargs = {} if kwargs is None else kwargs + block = self.block if block is None else block + track = self.track if track is None else track + targets = self.targets if targets is None else targets + + idents, _targets = self.client._build_targets(targets) + futures = [] + + pf = PrePickled(f) + pargs = [PrePickled(arg) for arg in args] + pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} + + + future = self.client.send_apply_request( + self._socket, pf, pargs, pkwargs, + track=track, ident=idents, metadata={'is_broadcast': True}) + futures.append(future) + if isinstance(targets, int): + futures = futures[0] + ar = AsyncResult(self.client, futures, fname=getname(f), targets=_targets, + owner=True) + if block: + try: + return ar.get() + except KeyboardInterrupt: + pass + return ar + + def map(self, f, *sequences, **kwargs): + pass + + + class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. @@ -1165,5 +1207,5 @@ def shutdown(self, wait=True): if wait: self.view.wait() -__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor'] +__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', 'BroadCastView'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py new file mode 100644 index 000000000..07e7dfbd5 --- /dev/null +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -0,0 +1,5 @@ +from ipyparallel.controller.scheduler import TaskScheduler + + +class BroadCastScheduler(TaskScheduler): + pass From e8e05f2d4769131b328f79731f63b0c94f0548aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 17 Jan 2020 10:58:58 +0100 Subject: [PATCH 04/34] BroadcastScheduler prototype --- ipyparallel/client/client.py | 23 ++- ipyparallel/client/view.py | 52 ++++- ipyparallel/controller/broadcast_scheduler.py | 5 - ipyparallel/controller/scheduler.py | 185 +++++++++++++++++- ipyparallel/util.py | 2 +- 5 files changed, 251 insertions(+), 16 deletions(-) delete mode 100644 ipyparallel/controller/broadcast_scheduler.py diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 546670463..962dffa88 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -1570,8 +1570,27 @@ def direct_view(self, targets='all', **kwargs): targets = self._build_targets(targets)[1] if single: targets = targets[0] - return DirectView(client=self, socket=self._mux_stream, targets=targets, - **kwargs) + return DirectView( + client=self, socket=self._mux_stream, targets=targets, **kwargs + ) + + def broadcast_view(self, targets='all', **kwargs): + """construct a BroadCastView object. + If no arguments are specified, create a BroadCastView + using all engines. + + Parameters + ---------- + + targets: list,slice,int,etc. [default: use all engines] + The subset of engines to broadcast execution to + kwargs: passed to BroadCastView + """ + targets = self._build_targets(targets)[1] + + return BroadCastView( + client=self, socket=self._task_stream, targets=targets, **kwargs + ) #-------------------------------------------------------------------------- # Query methods diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 8437bcd53..5c68204b6 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -128,7 +128,7 @@ def __len__(self): return 1 else: return len(self.client) - + def set_flags(self, **kwargs): """set my attribute flags by keyword.
@@ -819,6 +819,7 @@ def activate(self, suffix=''): Parameters ---------- + suffix: str [default: ''] The suffix, if any, for the magics. This allows you to have multiple views associated with parallel magics at the same time. @@ -845,7 +846,9 @@ def __init__(self, client=None, socket=None, targets=None): @sync_results @save_ids - def _really_apply(self, f, args=None, kwargs=None, targets=None, block=None, track=None): + def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, + after=None, follow=None, timeout=None, + targets=None, retries=None): args = [] if args is None else args kwargs = {} if kwargs is None else kwargs block = self.block if block is None else block track = self.track if track is None else track @@ -859,10 +862,13 @@ def _really_apply(self, f, args=None, kwargs=None, targets=None, block=None, tra pargs = [PrePickled(arg) for arg in args] pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} - + after = self._render_dependency(after) + follow = self._render_dependency(follow) + metadata = dict(after=after, follow=follow, timeout=timeout, targets=idents, retries=retries, is_broadcast=True) + future = self.client.send_apply_request( self._socket, pf, pargs, pkwargs, - track=track, ident=idents, metadata={'is_broadcast': True}) + track=track, ident=idents, metadata=metadata) futures.append(future) if isinstance(targets, int): futures = futures[0] @@ -877,7 +883,43 @@ def _really_apply(self, f, args=None, kwargs=None, targets=None, block=None, tra def map(self, f, *sequences, **kwargs): pass - + + def _validate_dependency(self, dep): + """validate a dependency. + + For use in `set_flags`. + """ + if dep is None or isinstance(dep, string_types + (AsyncResult, Dependency)): + return True + elif isinstance(dep, (list,set, tuple)): + for d in dep: + if not isinstance(d, string_types + (AsyncResult,)): + return False + elif isinstance(dep, dict): + if set(dep.keys()) != set(Dependency().as_dict().keys()): + return False + if not isinstance(dep['msg_ids'], list): + return False + for d in dep['msg_ids']: + if not isinstance(d, string_types): + return False + else: + return False + + return True + + def _render_dependency(self, dep): + """helper for building jsonable dependencies from various input forms.""" + if isinstance(dep, Dependency): + return dep.as_dict() + elif isinstance(dep, AsyncResult): + return dep.msg_ids + elif dep is None: + return [] + else: + # pass to Dependency constructor + return list(Dependency(dep)) + class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py deleted file mode 100644 index 07e7dfbd5..000000000 --- a/ipyparallel/controller/broadcast_scheduler.py +++ /dev/null @@ -1,5 +0,0 @@ -from ipyparallel.controller.scheduler import TaskScheduler - - -class BroadCastScheduler(TaskScheduler): - pass diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index b7c94133c..ce48e2870 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -15,6 +15,7 @@ from random import randint, random from types import FunctionType + try: import numpy except ImportError: numpy = None @@ -111,7 +112,6 @@ def leastload(loads): # Classes #--------------------------------------------------------------------- - # store empty default dependency: MET = Dependency([]) @@ -620,7 +620,7 @@ def submit_task(self, job, indices=None): @util.log_errors - def dispatch_result(self, raw_msg): + def dispatch_result(self, raw_msg): # maybe_dispatch_results?
"""dispatch method for result replies""" try: idents,msg = self.session.feed_identities(raw_msg, copy=False) @@ -836,11 +836,19 @@ def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=Non else: log = local_logger(logname, loglevel) - scheduler = TaskScheduler(client_stream=ins, engine_stream=outs, + scheduler = BroadCastScheduler(client_stream=ins, engine_stream=outs, mon_stream=mons, notifier_stream=nots, query_stream=querys, loop=loop, log=log, config=config) + + # scheduler = TaskScheduler(client_stream=ins, engine_stream=outs, + # mon_stream=mons, notifier_stream=nots, + # query_stream=querys, + # loop=loop, log=log, + # config=config) + + # TODO: How to start broadcastScheduler? scheduler.start() if not in_thread: try: @@ -848,3 +856,174 @@ def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=Non except KeyboardInterrupt: scheduler.log.critical("Interrupted, exiting...") + +class BroadCastScheduler(TaskScheduler): + jobs_running_on_targets = {} + accumulated_results = {} + + @util.log_errors + def dispatch_submission(self, raw_msg): + self.notifier_stream.flush() + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + except Exception as e: + self.log.error("task::Invaid task msg: %r" % raw_msg, exc_info=True) + return + + # send to monitor + self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False) + + header = msg['header'] + md = msg['metadata'] + msg_id = header['msg_id'] + + # get targets as a set of bytes objects + # from a list of unicode objects + targets = md.get('targets', []) + targets = set(map(cast_bytes, targets)) + + retries = md.get('retries', 0) + + self.accumulated_results[msg_id] = [] + + new_msg_ids = [] + for target in targets: + target_string = target.decode('utf-8') + msg_and_target_id = f'{msg_id}_{target_string}' + self.all_ids.add(msg_and_target_id) + self.retries[msg_and_target_id]= retries + new_msg_ids.append((msg_id, target, msg_and_target_id)) + + after = md.get('after', None) + if after: + after = Dependency(after) + if after.all: + if after.success: + after = Dependency(after.difference(self.all_completed), + success=after.success, + failure=after.failure, + all=after.all, + ) + if after.failure: + after = Dependency(after.difference(self.all_failed), + success=after.success, + failure=after.failure, + all=after.all, + ) + if after.check(self.all_completed, self.all_failed): + # recast as empty set, if `after` already met, + # to prevent unnecessary set comparisons + after = MET + else: + after = MET + + # location dependencies + follow = Dependency(md.get('follow', [])) + + timeout = md.get('timeout', None) + if timeout: + timeout = float(timeout) + md['original_msg_id'] = msg_id + + jobs = [] + self.jobs_running_on_targets[msg_id] = [] + + for msg_id, target, msg_and_target_id in new_msg_ids: + self.jobs_running_on_targets[msg_id].append(msg_and_target_id) + jobs.append(Job(msg_id=msg_and_target_id, raw_msg=raw_msg, idents=idents, msg=msg, + header=header, targets=[target], after=after, follow=follow, + timeout=timeout, metadata=md)) + + # # validate and reduce dependencies: + # for dep in after, follow: + # if not dep: # empty dependency + # continue + # # check valid: + # if msg_id in dep or dep.difference(self.all_ids): + # self.queue_map[msg_id] = job + # return self.fail_unreachable(msg_id, error.InvalidDependency) + # # check if unreachable: + # if dep.unreachable(self.all_completed, self.all_failed): + # 
self.queue_map[msg_id] = job + # return self.fail_unreachable(msg_id) + + if after.check(self.all_completed, self.all_failed): + # time deps already met, try to run + for job in jobs: + if not self.maybe_run(job): + # can't run yet + if job.msg_id not in self.all_failed: + # could have failed as unreachable + self.save_unmet(job) + else: + for job in jobs: + self.save_unmet(job) + + @util.log_errors + def dispatch_result(self, raw_msg): + """dispatch method for result replies""" + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + engine = idents[0] + + except Exception: + self.log.error("task::Invalid result: %r", raw_msg, exc_info=True) + return + md = msg['metadata'] + parent = msg['parent_header'] + if md.get('dependencies_met', True): + success = (md['status'] == 'ok') + msg_id = parent['msg_id'] + msg_and_target_id = f'{msg_id}_{engine.decode("utf-8")}' + retries = self.retries[msg_and_target_id] + if not success and retries > 0: + # failed + self.retries[msg_and_target_id] = retries - 1 + self.handle_unmet_dependency(idents, parent) + else: + del self.retries[msg_and_target_id] + # relay to client and update graph + self.handle_result(idents, parent, raw_msg, msg_and_target_id, success) + # send to Hub monitor + self.mon_stream.send_multipart([b'outtask'] + raw_msg, copy=False) + else: + self.handle_unmet_dependency(idents, parent) + + def handle_result(self, idents, parent, raw_msg, msg_and_target_id, success=True): + """handle a real task result, either success or failure""" + engine = idents[0] + client = idents[1] + # swap ids for ROUTER-ROUTER mirror + raw_msg[:2] = [client, engine] + # print (map(str, raw_msg[:4])) + # now, update our data structures + msg_id = parent['msg_id'] + if success: + self.all_completed.add(msg_and_target_id) + self.accumulated_results[msg_id].append(raw_msg[2:]) + else: + self.all_failed.add(msg_and_target_id) + self.accumulated_results[msg_id].append(None) # probably choose another value here + + if all( + msg_and_target_id in self.all_completed + or msg_and_target_id in self.all_failed + for msg_and_target_id in self.jobs_running_on_targets[msg_id] + ): + # flatten the per-engine reply frames into one coalesced reply + accumulated_msg = raw_msg[:2] + [ + frame for result in self.accumulated_results[msg_id] + if result is not None for frame in result + ] + self.client_stream.send_multipart(accumulated_msg, copy=False) + self.all_done.add(msg_id) + self.update_graph(msg_id, success) #?
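The coalescing prototype above keys all of its bookkeeping on per-target ids derived from the original msg_id, and only replies to the client once every target has reported back. A minimal, self-contained sketch of that id scheme (the helper names here are illustrative, not part of the patch):

    def broadcast_ids(msg_id, targets):
        # one sub-id per engine, mirroring the f'{msg_id}_{target}' scheme above
        return [f'{msg_id}_{target}' for target in targets]

    def all_replied(sub_ids, completed, failed):
        # a broadcast is finished once every per-engine sub-task completed or failed
        return all(s in completed or s in failed for s in sub_ids)

    sub_ids = broadcast_ids('abc123', ['engine-0', 'engine-1'])
    assert not all_replied(sub_ids, completed={'abc123_engine-0'}, failed=set())
    assert all_replied(sub_ids, completed={'abc123_engine-0'}, failed={'abc123_engine-1'})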
+ + + def submit_task(self, job, indices=None): + targets = [self.targets[i] for i in indices] # Should only be on target, consider changing + # send job to engines + for target in targets: + self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False) + self.engine_stream.send_multipart(job.raw_msg, copy=False) + + diff --git a/ipyparallel/util.py b/ipyparallel/util.py index 6fd04feb6..573db5867 100644 --- a/ipyparallel/util.py +++ b/ipyparallel/util.py @@ -123,7 +123,7 @@ def log_errors(f, self, *args, **kwargs): """ try: return f(self, *args, **kwargs) - except Exception: + except Exception as e: self.log.error("Uncaught exception in %r" % f, exc_info=True) From 25d2a9206a6a15ea4d557ce2285b5daf0cd22e60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Mon, 27 Jan 2020 16:36:37 +0100 Subject: [PATCH 05/34] BroadcastSchedulerNonCoalescing prototype --- ipyparallel/apps/ipcontrollerapp.py | 6 +- ipyparallel/client/client.py | 46 +- ipyparallel/client/view.py | 62 +- ipyparallel/controller/broadcast_scheduler.py | 67 ++ ipyparallel/controller/hub.py | 2 +- ipyparallel/controller/scheduler.py | 994 ++---------------- ipyparallel/controller/task_scheduler.py | 791 ++++++++++++++ ipyparallel/engine/engine.py | 2 +- ipyparallel/engine/kernel.py | 7 +- 9 files changed, 979 insertions(+), 998 deletions(-) create mode 100644 ipyparallel/controller/broadcast_scheduler.py create mode 100644 ipyparallel/controller/task_scheduler.py diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 2bab5cb4f..3355d0986 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -37,9 +37,11 @@ Session, session_aliases, session_flags, ) +from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing from ipyparallel.controller.heartmonitor import HeartMonitor from ipyparallel.controller.hub import HubFactory -from ipyparallel.controller.scheduler import TaskScheduler,launch_scheduler +from ipyparallel.controller.scheduler import launch_scheduler +from ipyparallel.controller.task_scheduler import TaskScheduler from ipyparallel.controller.dictdb import DictDB from ipyparallel.util import disambiguate_url @@ -398,7 +400,7 @@ def init_schedulers(self): else: self.log.info("task::using Python %s Task scheduler"%scheme) - sargs = (f.client_url('task'), f.engine_url('task'), + sargs = (BroadcastSchedulerNonCoalescing, f.client_url('task'), f.engine_url('task'), monitor_url, disambiguate_url(f.client_url('notification')), disambiguate_url(f.client_url('registration')), ) diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 962dffa88..33455c1e9 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -51,7 +51,7 @@ from ..util import ioloop from .asyncresult import AsyncResult, AsyncHubResult from .futures import MessageFuture, multi_future -from .view import DirectView, LoadBalancedView, BroadCastView +from .view import DirectView, LoadBalancedView, BroadcastView import jupyter_client.session jupyter_client.session.extract_dates = lambda obj: obj # -------------------------------------------------------------------------- @@ -65,6 +65,9 @@ def unpack_message(f, self, msg_parts): idents, msg = self.session.feed_identities(msg_parts, copy=False) try: msg = self.session.deserialize(msg, content=True, copy=False) + if 'is_broadcast' in msg['metadata'] and msg['metadata']['is_broadcast']: + msg['parent_header']['msg_id'] =\ + 
f'{msg["parent_header"]["msg_id"]}_{msg["metadata"]["engine"]}' except: self.log.error("Invalid Message", exc_info=True) else: @@ -981,6 +984,21 @@ def _dispatch_iopub(self, msg): # unhandled msg_type (status, etc.) pass + def create_message_futures(self, msg_id, async_result=False, track=False): + msg_future = MessageFuture(msg_id, track=track) + futures = [msg_future] + self._futures[msg_id] = msg_future + if async_result: + output = MessageFuture(msg_id) + # add future for output + self._output_futures[msg_id] = output + # hook up metadata + output.metadata = self.metadata[msg_id] + output.metadata['submitted'] = util.utcnow() + msg_future.output = output + futures.append(output) + return futures + def _send(self, socket, msg_type, content=None, parent=None, ident=None, buffers=None, track=False, header=None, metadata=None): """Send a message in the IO thread @@ -991,22 +1009,12 @@ def _send(self, socket, msg_type, content=None, parent=None, ident=None, msg = self.session.msg(msg_type, content=content, parent=parent, header=header, metadata=metadata) msg_id = msg['header']['msg_id'] - asyncresult = False - if msg_type in {'execute_request', 'apply_request'}: - asyncresult = True - # add future for output - self._output_futures[msg_id] = output = MessageFuture(msg_id) - # hook up metadata - output.metadata = self.metadata[msg_id] - - - self._futures[msg_id] = future = MessageFuture(msg_id, track=track) - futures = [future] - if asyncresult: - future.output = output - futures.append(output) - output.metadata['submitted'] = util.utcnow() + futures = self.create_message_futures( + msg_id, + async_result=msg_type in {'execute_request', 'apply_request'}, + track=track + ) def cleanup(f): """Purge caches on Future resolution""" @@ -1020,11 +1028,11 @@ def cleanup(f): def _really_send(): sent = self.session.send(socket, msg, track=track, buffers=buffers, ident=ident) if track: - future.tracker.set_result(sent['tracker']) + futures[0].tracker.set_result(sent['tracker']) # hand off actual send to IO thread self._io_loop.add_callback(_really_send) - return future + return futures[0] def _send_recv(self, *args, **kwargs): """Send a message in the IO thread and return its reply""" @@ -1588,7 +1596,7 @@ def broadcast_view(self, targets='all', **kwargs): """ targets = self._build_targets(targets)[1] - return BroadCastView( + return BroadcastView( client=self, socket=self._task_stream, targets=targets, **kwargs ) diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 5c68204b6..84c85e5d2 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -839,16 +839,14 @@ def activate(self, suffix=''): ip.magics_manager.register(M) -class BroadCastView(View): +class BroadcastView(DirectView): def __init__(self, client=None, socket=None, targets=None): super().__init__(client=client, socket=socket, targets=targets) @sync_results @save_ids - def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, - after=None, follow=None, timeout=None, - targets=None, retries=None): + def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targets=None): args = [] if args is None else args kwargs = {} if kwargs is None else kwargs block = self.block if block is None else block @@ -862,14 +860,21 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, pargs = [PrePickled(arg) for arg in args] pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} - after = self._render_dependency(after) - follow = self._render_dependency(follow) - 
metadata = dict(after=after, follow=follow, timeout=timeout, targets=idents, retries=retries, is_broadcast=True) + metadata = dict(targets=idents, is_broadcast=True) - future = self.client.send_apply_request( + original_future = self.client.send_apply_request( self._socket, pf, pargs, pkwargs, track=track, ident=idents, metadata=metadata) - futures.append(future) + + original_msg_id = original_future.msg_id + + for ident in idents: + msg_and_target_id = f'{original_msg_id}_{ident.decode("utf-8")}' + future = self.client.create_message_futures(msg_and_target_id, async_result=True, track=True) + self.client.outstanding.add(msg_and_target_id) + self.outstanding.add(msg_and_target_id) + futures.append(future[0]) + if isinstance(targets, int): futures = futures[0] ar = AsyncResult(self.client, futures, fname=getname(f), targets=_targets, @@ -884,43 +889,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, def map(self, f, *sequences, **kwargs): pass - def _validate_dependency(self, dep): - """validate a dependency. - - For use in `set_flags`. - """ - if dep is None or isinstance(dep, string_types + (AsyncResult, Dependency)): - return True - elif isinstance(dep, (list,set, tuple)): - for d in dep: - if not isinstance(d, string_types + (AsyncResult,)): - return False - elif isinstance(dep, dict): - if set(dep.keys()) != set(Dependency().as_dict().keys()): - return False - if not isinstance(dep['msg_ids'], list): - return False - for d in dep['msg_ids']: - if not isinstance(d, string_types): - return False - else: - return False - - return True - - def _render_dependency(self, dep): - """helper for building jsonable dependencies from various input forms.""" - if isinstance(dep, Dependency): - return dep.as_dict() - elif isinstance(dep, AsyncResult): - return dep.msg_ids - elif dep is None: - return [] - else: - # pass to Dependency constructor - return list(Dependency(dep)) - - class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. 
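On the client side, the BroadcastView above sends a single apply request but registers one MessageFuture per engine, keyed by the suffixed id that the scheduler stamps on each engine's reply. A rough sketch of that fan-out, with concurrent.futures.Future standing in for ipyparallel's MessageFuture (names are illustrative):

    from concurrent.futures import Future

    def fanout_futures(original_msg_id, engine_idents):
        # one pending future per engine; each resolves when that engine's reply
        # (tagged f'{original_msg_id}_{ident}') comes back from the scheduler
        return {
            f'{original_msg_id}_{ident.decode("utf-8")}': Future()
            for ident in engine_idents
        }

    futures = fanout_futures('abc123', [b'engine-0', b'engine-1'])
    assert sorted(futures) == ['abc123_engine-0', 'abc123_engine-1']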
@@ -1249,5 +1217,5 @@ def shutdown(self, wait=True): if wait: self.view.wait() -__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', 'BroadCastView'] +__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', 'BroadcastView'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py new file mode 100644 index 000000000..1cae2a38f --- /dev/null +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -0,0 +1,67 @@ +import zmq +from ipython_genutils.py3compat import cast_bytes + +from ipyparallel import util +from ipyparallel.controller.scheduler import Scheduler + + +class BroadcastSchedulerNonCoalescing(Scheduler): + @util.log_errors + def dispatch_submission(self, raw_msg): + try: + idents, msg_list = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg_list, content=False, copy=False) + except Exception as e: + self.log.error( + f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True + ) + return + + # send to monitor + self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False) + + header = msg['header'] + metadata = msg['metadata'] + original_msg_id = header['msg_id'] + + targets = metadata.get('targets', []) + + for target in targets: + msg_and_target_id = f'{original_msg_id}_{target}' + self.all_ids.add(msg_and_target_id) + header['msg_id'] = msg_and_target_id + raw_msg[1] = self.session.pack(header) + # TODO: Might have to change raw_msg to add new msg_id + self.engine_stream.send(cast_bytes(target), flags=zmq.SNDMORE, copy=False) + self.engine_stream.send_multipart(raw_msg, copy=False) + + @util.log_errors + def dispatch_result(self, raw_msg): + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + engine, client = idents[:2] # TODO: Make sure this is actually engine + except Exception as e: + self.log.error( + f'broadcast::Invalid broadcast reply: {raw_msg}', exc_info=True + ) + return + + metadata = msg['metadata'] + parent = msg['parent_header'] + + original_msg_id = parent['msg_id'] + msg_and_target_id = f'{original_msg_id}_{engine.decode("utf-8")}' + success = metadata['status'] == 'ok' + if success: + self.all_completed.add(msg_and_target_id) + else: + self.all_failed.add(msg_and_target_id) + + # swap ids for ROUTER-ROUTER mirror + raw_msg[:2] = [client, engine] + self.client_stream.send_multipart(raw_msg, copy=False) + self.all_done.add(msg_and_target_id) + + # send to Hub monitor. TODO: Figure out if this is needed + self.mon_stream.send_multipart([b'outtask'] + raw_msg, copy=False) diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index df8269b06..6acbf5ca5 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -270,7 +270,7 @@ def init_hub(self): if 'TaskScheduler.scheme_name' in self.config: scheme = self.config.TaskScheduler.scheme_name else: - from .scheduler import TaskScheduler + from .task_scheduler import TaskScheduler scheme = TaskScheduler.scheme_name.default_value # build connection dicts diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index ce48e2870..578abfed6 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -9,17 +9,8 @@ # Distributed under the terms of the Modified BSD License.
import logging -import time +from traitlets import observe, Instance, Set, CBytes -from collections import deque -from random import randint, random -from types import FunctionType - - -try: - import numpy -except ImportError: - numpy = None import zmq from zmq.eventloop import zmqstream @@ -28,210 +19,53 @@ from decorator import decorator from traitlets.config.application import Application from traitlets.config.loader import Config -from traitlets import Instance, Dict, List, Set, Integer, Enum, CBytes, observe -from ipython_genutils.py3compat import cast_bytes -from ipyparallel import error, util -from ipyparallel.factory import SessionFactory +from ipyparallel import util from ipyparallel.util import connect_logger, local_logger, ioloop -from .dependency import Dependency - import jupyter_client.session jupyter_client.session.extract_dates = lambda obj: obj +from jupyter_client.session import SessionFactory + @decorator -def logged(f,self,*args,**kwargs): +def logged(f, self, *args, **kwargs): # print ("#--------------------") self.log.debug("scheduler::%s(*%s,**%s)", f.__name__, args, kwargs) # print ("#--") - return f(self,*args, **kwargs) - -#---------------------------------------------------------------------- -# Chooser functions -#---------------------------------------------------------------------- - -def plainrandom(loads): - """Plain random pick.""" - n = len(loads) - return randint(0,n-1) - -def lru(loads): - """Always pick the front of the line. - - The content of `loads` is ignored. - - Assumes LRU ordering of loads, with oldest first. - """ - return 0 - -def twobin(loads): - """Pick two at random, use the LRU of the two. - - The content of loads is ignored. - - Assumes LRU ordering of loads, with oldest first. - """ - n = len(loads) - a = randint(0,n-1) - b = randint(0,n-1) - return min(a,b) - -def weighted(loads): - """Pick two at random using inverse load as weight. - - Return the less loaded of the two. - """ - # weight 0 a million times more than 1: - weights = 1./(1e-6+numpy.array(loads)) - sums = weights.cumsum() - t = sums[-1] - x = random()*t - y = random()*t - idx = 0 - idy = 0 - while sums[idx] < x: - idx += 1 - while sums[idy] < y: - idy += 1 - if weights[idy] > weights[idx]: - return idy - else: - return idx - -def leastload(loads): - """Always choose the lowest load. - - If the lowest load occurs more than once, the first - occurance will be used. If loads has LRU ordering, this means - the LRU of those with the lowest load is chosen. 
- """ - return loads.index(min(loads)) - -#--------------------------------------------------------------------- -# Classes -#--------------------------------------------------------------------- - -# store empty default dependency: -MET = Dependency([]) - - -class Job(object): - """Simple container for a job""" - def __init__(self, msg_id, raw_msg, idents, msg, header, metadata, - targets, after, follow, timeout): - self.msg_id = msg_id - self.raw_msg = raw_msg - self.idents = idents - self.msg = msg - self.header = header - self.metadata = metadata - self.targets = targets - self.after = after - self.follow = follow - self.timeout = timeout - - self.removed = False # used for lazy-delete from sorted queue - self.timestamp = time.time() - self.timeout_id = 0 - self.blacklist = set() - - def __lt__(self, other): - return self.timestamp < other.timestamp - - def __cmp__(self, other): - return cmp(self.timestamp, other.timestamp) - - @property - def dependents(self): - return self.follow.union(self.after) - - -class TaskScheduler(SessionFactory): - """Python TaskScheduler object. - - This is the simplest object that supports msg_id based - DAG dependencies. *Only* task msg_ids are checked, not - msg_ids of jobs submitted via the MUX queue. - - """ - - hwm = Integer(1, config=True, - help="""specify the High Water Mark (HWM) for the downstream - socket in the Task scheduler. This is the maximum number - of allowed outstanding tasks on each engine. - - The default (1) means that only one task can be outstanding on each - engine. Setting TaskScheduler.hwm=0 means there is no limit, and the - engines continue to be assigned tasks while they are working, - effectively hiding network latency behind computation, but can result - in an imbalance of work when submitting many heterogenous tasks all at - once. Any positive value greater than one is a compromise between the - two. 
- - """ - ) - scheme_name = Enum(('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'), - 'leastload', config=True, -help="""select the task scheduler scheme [default: Python LRU] - Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin','leastload'""" - ) - - @observe('scheme_name') - def _scheme_name_changed(self, change): - self.log.debug("Using scheme %r" % change['new']) - self.scheme = globals()[change['new']] - - # input arguments: - scheme = Instance(FunctionType) # function for determining the destination - def _scheme_default(self): - return leastload - client_stream = Instance(zmqstream.ZMQStream, allow_none=True) # client-facing stream - engine_stream = Instance(zmqstream.ZMQStream, allow_none=True) # engine-facing stream - notifier_stream = Instance(zmqstream.ZMQStream, allow_none=True) # hub-facing sub stream - mon_stream = Instance(zmqstream.ZMQStream, allow_none=True) # hub-facing pub stream - query_stream = Instance(zmqstream.ZMQStream, allow_none=True) # hub-facing DEALER stream - - # internals: - queue = Instance(deque) # sorted list of Jobs - def _queue_default(self): - return deque() - queue_map = Dict() # dict by msg_id of Jobs (for O(1) access to the Queue) - graph = Dict() # dict by msg_id of [ msg_ids that depend on key ] - retries = Dict() # dict by msg_id of retries remaining (non-neg ints) - # waiting = List() # list of msg_ids ready to run, but haven't due to HWM - pending = Dict() # dict by engine_uuid of submitted tasks - completed = Dict() # dict by engine_uuid of completed tasks - failed = Dict() # dict by engine_uuid of failed tasks - destinations = Dict() # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed) - clients = Dict() # dict by msg_id for who submitted the task - targets = List() # list of target IDENTs - loads = List() # list of engine loads - # full = Set() # set of IDENTs that have HWM outstanding tasks - all_completed = Set() # set of all completed tasks - all_failed = Set() # set of all failed tasks - all_done = Set() # set of all finished tasks=union(completed,failed) - all_ids = Set() # set of all submitted task IDs - - ident = CBytes() # ZMQ identity. This should just be self.session.session - # but ensure Bytes + return f(self, *args, **kwargs) + + +class Scheduler(SessionFactory): + client_stream = Instance( + zmqstream.ZMQStream, allow_none=True + ) # client-facing stream + engine_stream = Instance( + zmqstream.ZMQStream, allow_none=True + ) # engine-facing stream + notifier_stream = Instance( + zmqstream.ZMQStream, allow_none=True + ) # hub-facing sub stream + mon_stream = Instance(zmqstream.ZMQStream, allow_none=True) # hub-facing pub stream + query_stream = Instance( + zmqstream.ZMQStream, allow_none=True + ) # hub-facing DEALER stream + + all_completed = Set() # set of all completed tasks + all_failed = Set() # set of all failed tasks + all_done = Set() # set of all finished tasks=union(completed,failed) + all_ids = Set() # set of all submitted task IDs + + ident = CBytes() # ZMQ identity. 
This should just be self.session.session + + # but ensure Bytes def _ident_default(self): return self.session.bsession def start(self): - self.query_stream.on_recv(self.dispatch_query_reply) - self.session.send(self.query_stream, "connection_request", {}) - self.engine_stream.on_recv(self.dispatch_result, copy=False) self.client_stream.on_recv(self.dispatch_submission, copy=False) - self._notification_handlers = dict( - registration_notification = self._register_engine, - unregistration_notification = self._unregister_engine - ) - self.notifier_stream.on_recv(self.dispatch_notification) - self.log.info("Scheduler started [%s]" % self.scheme_name) - def resume_receiving(self): """Resume accepting jobs.""" self.client_stream.on_recv(self.dispatch_submission, copy=False) @@ -241,557 +75,28 @@ def stop_receiving(self): Leave them in the ZMQ queue.""" self.client_stream.on_recv(None) - #----------------------------------------------------------------------- - # [Un]Registration Handling - #----------------------------------------------------------------------- - - - def dispatch_query_reply(self, msg): - """handle reply to our initial connection request""" - try: - idents,msg = self.session.feed_identities(msg) - except ValueError: - self.log.warn("task::Invalid Message: %r",msg) - return - try: - msg = self.session.deserialize(msg) - except ValueError: - self.log.warn("task::Unauthorized message from: %r"%idents) - return - - content = msg['content'] - for uuid in content.get('engines', {}).values(): - self._register_engine(cast_bytes(uuid)) - - - @util.log_errors - def dispatch_notification(self, msg): - """dispatch register/unregister events.""" - try: - idents,msg = self.session.feed_identities(msg) - except ValueError: - self.log.warn("task::Invalid Message: %r",msg) - return - try: - msg = self.session.deserialize(msg) - except ValueError: - self.log.warn("task::Unauthorized message from: %r"%idents) - return - - msg_type = msg['header']['msg_type'] - - handler = self._notification_handlers.get(msg_type, None) - if handler is None: - self.log.error("Unhandled message type: %r"%msg_type) - else: - try: - handler(cast_bytes(msg['content']['uuid'])) - except Exception: - self.log.error("task::Invalid notification msg: %r", msg, exc_info=True) - - def _register_engine(self, uid): - """New engine with ident `uid` became available.""" - # head of the line: - self.targets.insert(0,uid) - self.loads.insert(0,0) - - # initialize sets - self.completed[uid] = set() - self.failed[uid] = set() - self.pending[uid] = {} - - # rescan the graph: - self.update_graph(None) - - def _unregister_engine(self, uid): - """Existing engine with ident `uid` became unavailable.""" - if len(self.targets) == 1: - # this was our only engine - pass - - # handle any potentially finished tasks: - self.engine_stream.flush() - - # don't pop destinations, because they might be used later - # map(self.destinations.pop, self.completed.pop(uid)) - # map(self.destinations.pop, self.failed.pop(uid)) - - # prevent this engine from receiving work - idx = self.targets.index(uid) - self.targets.pop(idx) - self.loads.pop(idx) - - # wait 5 seconds before cleaning up pending jobs, since the results might - # still be incoming - if self.pending[uid]: - self.loop.add_timeout( - self.loop.time() + 5, - lambda: self.handle_stranded_tasks(uid), - ) - else: - self.completed.pop(uid) - self.failed.pop(uid) - - def handle_stranded_tasks(self, engine): - """Deal with jobs resident in an engine that died.""" - lost = self.pending[engine] - for 
msg_id in list(lost.keys()): - if msg_id not in lost: - # prevent double-handling of messages - continue - - raw_msg = lost[msg_id].raw_msg - idents, msg = self.session.feed_identities(raw_msg, copy=False) - parent = self.session.unpack(msg[1].bytes) - idents = [engine, idents[0]] - - # build fake error reply - try: - raise error.EngineError("Engine %r died while running task %r"%(engine, msg_id)) - except: - content = error.wrap_exception() - # build fake metadata - md = dict( - status=u'error', - engine=engine.decode('ascii'), - date=util.utcnow(), - ) - msg = self.session.msg('apply_reply', content, parent=parent, metadata=md) - raw_reply = list(map(zmq.Message, self.session.serialize(msg, ident=idents))) - # and dispatch it - self.dispatch_result(raw_reply) - - # finally scrub completed/failed lists - self.completed.pop(engine) - self.failed.pop(engine) - - - #----------------------------------------------------------------------- - # Job Submission - #----------------------------------------------------------------------- - + def dispatch_result(self, raw_msg): + raise NotImplementedError("Implement in subclasses") - @util.log_errors def dispatch_submission(self, raw_msg): - """Dispatch job submission to appropriate handlers.""" - # ensure targets up to date: - self.notifier_stream.flush() - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - except Exception: - self.log.error("task::Invaid task msg: %r"%raw_msg, exc_info=True) - return - - - # send to monitor - self.mon_stream.send_multipart([b'intask']+raw_msg, copy=False) - - header = msg['header'] - md = msg['metadata'] - msg_id = header['msg_id'] - self.all_ids.add(msg_id) + raise NotImplementedError("Implement in subclasses") - # get targets as a set of bytes objects - # from a list of unicode objects - targets = md.get('targets', []) - targets = set(map(cast_bytes, targets)) - retries = md.get('retries', 0) - self.retries[msg_id] = retries - - # time dependencies - after = md.get('after', None) - if after: - after = Dependency(after) - if after.all: - if after.success: - after = Dependency(after.difference(self.all_completed), - success=after.success, - failure=after.failure, - all=after.all, - ) - if after.failure: - after = Dependency(after.difference(self.all_failed), - success=after.success, - failure=after.failure, - all=after.all, - ) - if after.check(self.all_completed, self.all_failed): - # recast as empty set, if `after` already met, - # to prevent unnecessary set comparisons - after = MET - else: - after = MET - - # location dependencies - follow = Dependency(md.get('follow', [])) - - timeout = md.get('timeout', None) - if timeout: - timeout = float(timeout) - - job = Job(msg_id=msg_id, raw_msg=raw_msg, idents=idents, msg=msg, - header=header, targets=targets, after=after, follow=follow, - timeout=timeout, metadata=md, - ) - # validate and reduce dependencies: - for dep in after,follow: - if not dep: # empty dependency - continue - # check valid: - if msg_id in dep or dep.difference(self.all_ids): - self.queue_map[msg_id] = job - return self.fail_unreachable(msg_id, error.InvalidDependency) - # check if unreachable: - if dep.unreachable(self.all_completed, self.all_failed): - self.queue_map[msg_id] = job - return self.fail_unreachable(msg_id) - - if after.check(self.all_completed, self.all_failed): - # time deps already met, try to run - if not self.maybe_run(job): - # can't run yet - if msg_id not in self.all_failed: - # could have 
failed as unreachable - self.save_unmet(job) - else: - self.save_unmet(job) - - def job_timeout(self, job, timeout_id): - """callback for a job's timeout. - - The job may or may not have been run at this point. - """ - if job.timeout_id != timeout_id: - # not the most recent call - return - now = time.time() - if job.timeout >= (now + 1): - self.log.warn("task %s timeout fired prematurely: %s > %s", - job.msg_id, job.timeout, now - ) - if job.msg_id in self.queue_map: - # still waiting, but ran out of time - self.log.info("task %r timed out", job.msg_id) - self.fail_unreachable(job.msg_id, error.TaskTimeout) - def fail_unreachable(self, msg_id, why=error.ImpossibleDependency): - """a task has become unreachable, send a reply with an ImpossibleDependency - error.""" - if msg_id not in self.queue_map: - self.log.error("task %r already failed!", msg_id) - return - job = self.queue_map.pop(msg_id) - # lazy-delete from the queue - job.removed = True - for mid in job.dependents: - if mid in self.graph: - self.graph[mid].remove(msg_id) - - try: - raise why() - except: - content = error.wrap_exception() - self.log.debug("task %r failing as unreachable with: %s", msg_id, content['ename']) - - self.all_done.add(msg_id) - self.all_failed.add(msg_id) - - msg = self.session.send(self.client_stream, 'apply_reply', content, - parent=job.header, ident=job.idents) - self.session.send(self.mon_stream, msg, ident=[b'outtask']+job.idents) - - self.update_graph(msg_id, success=False) - - def available_engines(self): - """return a list of available engine indices based on HWM""" - if not self.hwm: - return list(range(len(self.targets))) - available = [] - for idx in range(len(self.targets)): - if self.loads[idx] < self.hwm: - available.append(idx) - return available - - def maybe_run(self, job): - """check location dependencies, and run if they are met.""" - msg_id = job.msg_id - self.log.debug("Attempting to assign task %s", msg_id) - available = self.available_engines() - if not available: - # no engines, definitely can't run - return False - - if job.follow or job.targets or job.blacklist or self.hwm: - # we need a can_run filter - def can_run(idx): - # check hwm - if self.hwm and self.loads[idx] == self.hwm: - return False - target = self.targets[idx] - # check blacklist - if target in job.blacklist: - return False - # check targets - if job.targets and target not in job.targets: - return False - # check follow - return job.follow.check(self.completed[target], self.failed[target]) - - indices = list(filter(can_run, available)) - - if not indices: - # couldn't run - if job.follow.all: - # check follow for impossibility - dests = set() - relevant = set() - if job.follow.success: - relevant = self.all_completed - if job.follow.failure: - relevant = relevant.union(self.all_failed) - for m in job.follow.intersection(relevant): - dests.add(self.destinations[m]) - if len(dests) > 1: - self.queue_map[msg_id] = job - self.fail_unreachable(msg_id) - return False - if job.targets: - # check blacklist+targets for impossibility - job.targets.difference_update(job.blacklist) - if not job.targets or not job.targets.intersection(self.targets): - self.queue_map[msg_id] = job - self.fail_unreachable(msg_id) - return False - return False - else: - indices = None - - self.submit_task(job, indices) - return True - - def save_unmet(self, job): - """Save a message for later submission when its dependencies are met.""" - msg_id = job.msg_id - self.log.debug("Adding task %s to the queue", msg_id) - self.queue_map[msg_id] = job - 
self.queue.append(job) - # track the ids in follow or after, but not those already finished - for dep_id in job.after.union(job.follow).difference(self.all_done): - if dep_id not in self.graph: - self.graph[dep_id] = set() - self.graph[dep_id].add(msg_id) - - # schedule timeout callback - if job.timeout: - timeout_id = job.timeout_id = job.timeout_id + 1 - self.loop.add_timeout(time.time() + job.timeout, - lambda : self.job_timeout(job, timeout_id) - ) - - - def submit_task(self, job, indices=None): - """Submit a task to any of a subset of our targets.""" - if indices: - loads = [self.loads[i] for i in indices] - else: - loads = self.loads - idx = self.scheme(loads) - if indices: - idx = indices[idx] - target = self.targets[idx] - # print (target, map(str, msg[:3])) - # send job to the engine - self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False) - self.engine_stream.send_multipart(job.raw_msg, copy=False) - # update load - self.add_job(idx) - self.pending[target][job.msg_id] = job - # notify Hub - content = dict(msg_id=job.msg_id, engine_id=target.decode('ascii')) - self.session.send(self.mon_stream, 'task_destination', content=content, - ident=[b'tracktask',self.ident]) - - - #----------------------------------------------------------------------- - # Result Handling - #----------------------------------------------------------------------- - - - @util.log_errors - def dispatch_result(self, raw_msg): # maybe_dispatch_reults ? - """dispatch method for result replies""" - try: - idents,msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - engine = idents[0] - try: - idx = self.targets.index(engine) - except ValueError: - pass # skip load-update for dead engines - else: - self.finish_job(idx) - except Exception: - self.log.error("task::Invalid result: %r", raw_msg, exc_info=True) - return - - md = msg['metadata'] - parent = msg['parent_header'] - if md.get('dependencies_met', True): - success = (md['status'] == 'ok') - msg_id = parent['msg_id'] - retries = self.retries[msg_id] - if not success and retries > 0: - # failed - self.retries[msg_id] = retries - 1 - self.handle_unmet_dependency(idents, parent) - else: - del self.retries[msg_id] - # relay to client and update graph - self.handle_result(idents, parent, raw_msg, success) - # send to Hub monitor - self.mon_stream.send_multipart([b'outtask']+raw_msg, copy=False) - else: - self.handle_unmet_dependency(idents, parent) - - def handle_result(self, idents, parent, raw_msg, success=True): - """handle a real task result, either success or failure""" - # first, relay result to client - engine = idents[0] - client = idents[1] - # swap_ids for ROUTER-ROUTER mirror - raw_msg[:2] = [client,engine] - # print (map(str, raw_msg[:4])) - self.client_stream.send_multipart(raw_msg, copy=False) - # now, update our data structures - msg_id = parent['msg_id'] - self.pending[engine].pop(msg_id) - if success: - self.completed[engine].add(msg_id) - self.all_completed.add(msg_id) - else: - self.failed[engine].add(msg_id) - self.all_failed.add(msg_id) - self.all_done.add(msg_id) - self.destinations[msg_id] = engine - - self.update_graph(msg_id, success) - - def handle_unmet_dependency(self, idents, parent): - """handle an unmet dependency""" - engine = idents[0] - msg_id = parent['msg_id'] - - job = self.pending[engine].pop(msg_id) - job.blacklist.add(engine) - - if job.blacklist == job.targets: - self.queue_map[msg_id] = job - self.fail_unreachable(msg_id) - elif not 
self.maybe_run(job): - # resubmit failed - if msg_id not in self.all_failed: - # put it back in our dependency tree - self.save_unmet(job) - - if self.hwm: - try: - idx = self.targets.index(engine) - except ValueError: - pass # skip load-update for dead engines - else: - if self.loads[idx] == self.hwm-1: - self.update_graph(None) - - def update_graph(self, dep_id=None, success=True): - """dep_id just finished. Update our dependency - graph and submit any jobs that just became runnable. - - Called with dep_id=None to update entire graph for hwm, but without finishing a task. - """ - # print ("\n\n***********") - # pprint (dep_id) - # pprint (self.graph) - # pprint (self.queue_map) - # pprint (self.all_completed) - # pprint (self.all_failed) - # print ("\n\n***********\n\n") - # update any jobs that depended on the dependency - msg_ids = self.graph.pop(dep_id, []) - - # recheck *all* jobs if - # a) we have HWM and an engine just become no longer full - # or b) dep_id was given as None - - if dep_id is None or self.hwm and any( [ load==self.hwm-1 for load in self.loads ]): - jobs = self.queue - using_queue = True - else: - using_queue = False - jobs = deque(sorted( self.queue_map[msg_id] for msg_id in msg_ids )) - - to_restore = [] - while jobs: - job = jobs.popleft() - if job.removed: - continue - msg_id = job.msg_id - - put_it_back = True - - if job.after.unreachable(self.all_completed, self.all_failed)\ - or job.follow.unreachable(self.all_completed, self.all_failed): - self.fail_unreachable(msg_id) - put_it_back = False - - elif job.after.check(self.all_completed, self.all_failed): # time deps met, maybe run - if self.maybe_run(job): - put_it_back = False - self.queue_map.pop(msg_id) - for mid in job.dependents: - if mid in self.graph: - self.graph[mid].remove(msg_id) - - # abort the loop if we just filled up all of our engines. - # avoids an O(N) operation in situation of full queue, - # where graph update is triggered as soon as an engine becomes - # non-full, and all tasks after the first are checked, - # even though they can't run. - if not self.available_engines(): - break - - if using_queue and put_it_back: - # popped a job from the queue but it neither ran nor failed, - # so we need to put it back when we are done - # make sure to_restore preserves the same ordering - to_restore.append(job) - - # put back any tasks we popped but didn't run - if using_queue: - self.queue.extendleft(to_restore) - - #---------------------------------------------------------------------- - # methods to be overridden by subclasses - #---------------------------------------------------------------------- - - def add_job(self, idx): - """Called after self.targets[idx] just got the job with header. - Override with subclasses. The default ordering is simple LRU. - The default loads are the number of outstanding jobs.""" - self.loads[idx] += 1 - for lis in (self.targets, self.loads): - lis.append(lis.pop(idx)) - - def finish_job(self, idx): - """Called after self.targets[idx] just finished a job. 
- Override with subclasses.""" - self.loads[idx] -= 1 - - -def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=None, - logname='root', log_url=None, loglevel=logging.DEBUG, - identity=b'task', in_thread=False): +def launch_scheduler( + scheduler_class, + in_addr, + out_addr, + mon_addr, + not_addr, + reg_addr, + config=None, + logname='root', + log_url=None, + loglevel=logging.DEBUG, + identity=b'task', + in_thread=False, +): ZMQStream = zmqstream.ZMQStream @@ -808,23 +113,23 @@ def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=Non # for safety with multiprocessing ctx = zmq.Context() loop = ioloop.IOLoop() - ins = ZMQStream(ctx.socket(zmq.ROUTER),loop) + ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(ins, 0) ins.setsockopt(zmq.IDENTITY, identity + b'_in') ins.bind(in_addr) - outs = ZMQStream(ctx.socket(zmq.ROUTER),loop) + outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(outs, 0) outs.setsockopt(zmq.IDENTITY, identity + b'_out') outs.bind(out_addr) - mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB),loop) + mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop) util.set_hwm(mons, 0) mons.connect(mon_addr) - nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB),loop) + nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB), loop) nots.setsockopt(zmq.SUBSCRIBE, b'') nots.connect(not_addr) - querys = ZMQStream(ctx.socket(zmq.DEALER),loop) + querys = ZMQStream(ctx.socket(zmq.DEALER), loop) querys.connect(reg_addr) # setup logging. @@ -832,15 +137,22 @@ def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=Non log = Application.instance().log else: if log_url: - log = connect_logger(logname, ctx, log_url, root="scheduler", loglevel=loglevel) + log = connect_logger( + logname, ctx, log_url, root="scheduler", loglevel=loglevel + ) else: log = local_logger(logname, loglevel) - scheduler = BroadCastScheduler(client_stream=ins, engine_stream=outs, - mon_stream=mons, notifier_stream=nots, - query_stream=querys, - loop=loop, log=log, - config=config) + scheduler = scheduler_class( + client_stream=ins, + engine_stream=outs, + mon_stream=mons, + notifier_stream=nots, + query_stream=querys, + loop=loop, + log=log, + config=config, + ) # scheduler = TaskScheduler(client_stream=ins, engine_stream=outs, # mon_stream=mons, notifier_stream=nots, @@ -854,176 +166,4 @@ def launch_scheduler(in_addr, out_addr, mon_addr, not_addr, reg_addr, config=Non try: loop.start() except KeyboardInterrupt: - scheduler.log.critical("Interrupted, exiting...") - - -class BroadCastScheduler(TaskScheduler): - jobs_running_on_targets = {} - accumulated_results = {} - - @util.log_errors - def dispatch_submission(self, raw_msg): - self.notifier_stream.flush() - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - except Exception as e: - self.log.error("task::Invaid task msg: %r" % raw_msg, exc_info=True) - return - - # send to monitor - self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False) - - header = msg['header'] - md = msg['metadata'] - msg_id = header['msg_id'] - - # get targets as a set of bytes objects - # from a list of unicode objects - targets = md.get('targets', []) - targets = set(map(cast_bytes, targets)) - - retries = md.get('retries', 0) - - self.accumulated_results[msg_id] = [] - - new_msg_ids = [] - for target in targets: - target_string = target.decode('utf-8') - msg_and_target_id = f'{msg_id}_{target_string}' - 
self.all_ids.add(msg_and_target_id) - self.retries[msg_and_target_id]= retries - new_msg_ids.append((msg_id, target, msg_and_target_id)) - - after = md.get('after', None) - if after: - after = Dependency(after) - if after.all: - if after.success: - after = Dependency(after.difference(self.all_completed), - success=after.success, - failure=after.failure, - all=after.all, - ) - if after.failure: - after = Dependency(after.difference(self.all_failed), - success=after.success, - failure=after.failure, - all=after.all, - ) - if after.check(self.all_completed, self.all_failed): - # recast as empty set, if `after` already met, - # to prevent unnecessary set comparisons - after = MET - else: - after = MET - - # location dependencies - follow = Dependency(md.get('follow', [])) - - timeout = md.get('timeout', None) - if timeout: - timeout = float(timeout) - md['original_msg_id'] = msg_id - - jobs = [] - self.jobs_running_on_targets[msg_id] = [] - - for msg_id, target, msg_and_target_id in new_msg_ids: - self.jobs_running_on_targets[msg_id].append(msg_and_target_id) - jobs.append(Job(msg_id=msg_and_target_id, raw_msg=raw_msg, idents=idents, msg=msg, - header=header, targets=[target], after=after, follow=follow, - timeout=timeout, metadata=md)) - - # # validate and reduce dependencies: - # for dep in after, follow: - # if not dep: # empty dependency - # continue - # # check valid: - # if msg_id in dep or dep.difference(self.all_ids): - # self.queue_map[msg_id] = job - # return self.fail_unreachable(msg_id, error.InvalidDependency) - # # check if unreachable: - # if dep.unreachable(self.all_completed, self.all_failed): - # self.queue_map[msg_id] = job - # return self.fail_unreachable(msg_id) - - if after.check(self.all_completed, self.all_failed): - # time deps already met, try to run - for job in jobs: - if not self.maybe_run(job): - # can't run yet - if job.msg_id not in self.all_failed: - # could have failed as unreachable - self.save_unmet(job) - else: - for job in jobs: - self.save_unmet(job) - - @util.log_errors - def dispatch_result(self, raw_msg): - """ Dispatch method for result replies""" - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - engine = idents[0] - - except Exception: - self.log.error("task::Invalid result: %r", raw_msg, exc_info=True) - return - md = msg['metadata'] - parent = msg['parent_header'] - if md.get('dependencies_met', True): - success = (md['status'] == 'ok') - msg_id = parent['msg_id'] - msg_and_target_id = f'{msg_id}_{engine.decode("utf-8")}' - retries = self.retries[msg_and_target_id] - if not success and retries > 0: - # failed - self.retries[msg_and_target_id] = retries - 1 - self.handle_unmet_dependency(idents, parent) - else: - del self.retries[msg_and_target_id] - # relay to client and update graph - self.handle_result(idents, parent, raw_msg, msg_and_target_id, success) - # send to Hub monitor - self.mon_stream.send_multipart([b'outtask'] + raw_msg, copy=False) - else: - self.handle_unmet_dependency(idents, parent) - - def handle_result(self, idents, parent, raw_msg, msg_and_target_id, success=True): - """handle a real task result, either success or failure""" - engine = idents[0] - client = idents[1] - # swap_ids for ROUTER-ROUTER mirror - raw_msg[:2] = [client, engine] - # print (map(str, raw_msg[:4])) - # now, update our data structures - msg_id = parent['msg_id'] - self.all_completed.add(msg_and_target_id) - if success: - self.all_completed.add(msg_and_target_id) - 
self.accumulated_results[msg_id].append(raw_msg[2:])
-        else:
-            self.all_failed.add(msg_and_target_id)
-            self.accumulated_results[msg_id].append(None)# Probably choose another value here
-
-        if all(
-            msg_and_target_id in self.all_completed
-            or msg_and_target_id in self.all_failed
-            for msg_and_target_id in self.jobs_running_on_targets[msg_id]
-        ):
-            accumulated_msg = raw_msg[:2] + self.accumulated_results[msg_id]
-            self.client_stream.send_multipart(raw_msg, copy=False)
-            self.all_done.add(msg_id)
-        self.update_graph(msg_id, success) #?
-
-
-    def submit_task(self, job, indices=None):
-        targets = [self.targets[i] for i in indices]  # Should only be on target, consider changing
-        # send job to engines
-        for target in targets:
-            self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False)
-            self.engine_stream.send_multipart(job.raw_msg, copy=False)
-
-
+        scheduler.log.critical("Interrupted, exiting...")
\ No newline at end of file
diff --git a/ipyparallel/controller/task_scheduler.py b/ipyparallel/controller/task_scheduler.py
new file mode 100644
index 000000000..656de8a42
--- /dev/null
+++ b/ipyparallel/controller/task_scheduler.py
@@ -0,0 +1,791 @@
+import time
+from collections import deque
+from random import randint, random  # random() is needed by weighted() below
+from types import FunctionType
+
+import zmq
+from ipython_genutils.py3compat import cast_bytes
+from traitlets import Integer, Enum, observe, Instance, Dict, List
+
+from ipyparallel import util, error, Dependency
+from ipyparallel.controller.scheduler import Scheduler
+
+try:
+    import numpy
+except ImportError:
+    numpy = None
+
+# ----------------------------------------------------------------------
+# Chooser functions
+# ----------------------------------------------------------------------
+
+
+def plainrandom(loads):
+    """Plain random pick."""
+    n = len(loads)
+    return randint(0, n - 1)
+
+
+def lru(loads):
+    """Always pick the front of the line.
+
+    The content of `loads` is ignored.
+
+    Assumes LRU ordering of loads, with oldest first.
+    """
+    return 0
+
+
+def twobin(loads):
+    """Pick two at random, use the LRU of the two.
+
+    The content of loads is ignored.
+
+    Assumes LRU ordering of loads, with oldest first.
+    """
+    n = len(loads)
+    a = randint(0, n - 1)
+    b = randint(0, n - 1)
+    return min(a, b)
+
+
+def weighted(loads):
+    """Pick two at random using inverse load as weight.
+
+    Return the less loaded of the two.
+    """
+    # weight 0 a million times more than 1:
+    weights = 1.0 / (1e-6 + numpy.array(loads))
+    sums = weights.cumsum()
+    t = sums[-1]
+    x = random() * t
+    y = random() * t
+    idx = 0
+    idy = 0
+    while sums[idx] < x:
+        idx += 1
+    while sums[idy] < y:
+        idy += 1
+    if weights[idy] > weights[idx]:
+        return idy
+    else:
+        return idx
+
+
+def leastload(loads):
+    """Always choose the lowest load.
+
+    If the lowest load occurs more than once, the first
+    occurrence will be used. If loads has LRU ordering, this means
+    the LRU of those with the lowest load is chosen.
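+
+    For example, leastload([2, 0, 1, 0]) returns 1: the minimum load 0
+    first occurs at index 1.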
+    """
+    return loads.index(min(loads))
+
+
+# ---------------------------------------------------------------------
+# Classes
+# ---------------------------------------------------------------------
+
+# store empty default dependency:
+MET = Dependency([])
+
+
+class Job(object):
+    """Simple container for a job"""
+
+    def __init__(
+        self,
+        msg_id,
+        raw_msg,
+        idents,
+        msg,
+        header,
+        metadata,
+        targets,
+        after,
+        follow,
+        timeout,
+    ):
+        self.msg_id = msg_id
+        self.raw_msg = raw_msg
+        self.idents = idents
+        self.msg = msg
+        self.header = header
+        self.metadata = metadata
+        self.targets = targets
+        self.after = after
+        self.follow = follow
+        self.timeout = timeout
+
+        self.removed = False  # used for lazy-delete from sorted queue
+        self.timestamp = time.time()
+        self.timeout_id = 0
+        self.blacklist = set()
+
+    def __lt__(self, other):
+        return self.timestamp < other.timestamp
+
+    def __cmp__(self, other):
+        # py2-only ordering hook, written without the builtin cmp()
+        # (removed in py3, where this would raise NameError):
+        return (self.timestamp > other.timestamp) - (
+            self.timestamp < other.timestamp
+        )
+
+    @property
+    def dependents(self):
+        return self.follow.union(self.after)
+
+
+class TaskScheduler(Scheduler):
+    """Python TaskScheduler object.
+
+    This is the simplest object that supports msg_id based
+    DAG dependencies. *Only* task msg_ids are checked, not
+    msg_ids of jobs submitted via the MUX queue.
+
+    """
+
+    hwm = Integer(
+        1,
+        config=True,
+        help="""specify the High Water Mark (HWM) for the downstream
+        socket in the Task scheduler. This is the maximum number
+        of allowed outstanding tasks on each engine.
+
+        The default (1) means that only one task can be outstanding on each
+        engine. Setting TaskScheduler.hwm=0 means there is no limit, and the
+        engines continue to be assigned tasks while they are working,
+        effectively hiding network latency behind computation, but can result
+        in an imbalance of work when submitting many heterogeneous tasks all at
+        once. Any positive value greater than one is a compromise between the
+        two.
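+
+        For example, hwm=2 lets an engine be assigned one new task while
+        another is still running, trading load balance for hidden latency.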
+
+        """,
+    )
+
+    scheme_name = Enum(
+        ('leastload', 'pure', 'lru', 'plainrandom', 'weighted', 'twobin'),
+        'leastload',
+        config=True,
+        help="""select the task scheduler scheme  [default: Python LRU]
+            Options are: 'pure', 'lru', 'plainrandom', 'weighted', 'twobin', 'leastload'""",
+    )
+
+    @observe('scheme_name')
+    def _scheme_name_changed(self, change):
+        self.log.debug("Using scheme %r" % change['new'])
+        self.scheme = globals()[change['new']]
+
+    # input arguments:
+    scheme = Instance(FunctionType)  # function for determining the destination
+
+    def _scheme_default(self):
+        return leastload
+
+    # internals:
+    queue = Instance(deque)  # sorted list of Jobs
+
+    def _queue_default(self):
+        return deque()
+
+    queue_map = Dict()  # dict by msg_id of Jobs (for O(1) access to the Queue)
+    graph = Dict()  # dict by msg_id of [ msg_ids that depend on key ]
+    retries = Dict()  # dict by msg_id of retries remaining (non-neg ints)
+    # waiting = List()  # list of msg_ids ready to run, but haven't due to HWM
+    pending = Dict()  # dict by engine_uuid of submitted tasks
+    completed = Dict()  # dict by engine_uuid of completed tasks
+    failed = Dict()  # dict by engine_uuid of failed tasks
+    destinations = (
+        Dict()
+    )  # dict by msg_id of engine_uuids where jobs ran (reverse of completed+failed)
+    clients = Dict()  # dict by msg_id for who submitted the task
+    targets = List()  # list of target IDENTs
+    loads = List()  # list of engine loads
+    # full = Set()  # set of IDENTs that have HWM outstanding tasks
+
+    def start(self):
+        super().start()
+        self.query_stream.on_recv(self.dispatch_query_reply)
+        self.session.send(self.query_stream, "connection_request", {})
+        self._notification_handlers = dict(
+            registration_notification=self._register_engine,
+            unregistration_notification=self._unregister_engine,
+        )
+        self.log.info("Scheduler started [%s]" % self.scheme_name)
+        self.notifier_stream.on_recv(self.dispatch_notification)
+
+    # -----------------------------------------------------------------------
+    # [Un]Registration Handling
+    # -----------------------------------------------------------------------
+
+    def dispatch_query_reply(self, msg):
+        """handle reply to our initial connection request"""
+        try:
+            idents, msg = self.session.feed_identities(msg)
+        except ValueError:
+            self.log.warn("task::Invalid Message: %r", msg)
+            return
+        try:
+            msg = self.session.deserialize(msg)
+        except ValueError:
+            self.log.warn("task::Unauthorized message from: %r" % idents)
+            return
+
+        content = msg['content']
+        for uuid in content.get('engines', {}).values():
+            self._register_engine(cast_bytes(uuid))
+
+    @util.log_errors
+    def dispatch_notification(self, msg):
+        """dispatch register/unregister events."""
+        try:
+            idents, msg = self.session.feed_identities(msg)
+        except ValueError:
+            self.log.warn("task::Invalid Message: %r", msg)
+            return
+        try:
+            msg = self.session.deserialize(msg)
+        except ValueError:
+            self.log.warn("task::Unauthorized message from: %r" % idents)
+            return
+
+        msg_type = msg['header']['msg_type']
+
+        handler = self._notification_handlers.get(msg_type, None)
+        if handler is None:
+            self.log.error("Unhandled message type: %r" % msg_type)
+        else:
+            try:
+                handler(cast_bytes(msg['content']['uuid']))
+            except Exception:
+                self.log.error("task::Invalid
notification msg: %r", msg, exc_info=True) + + def _register_engine(self, uid): + """New engine with ident `uid` became available.""" + # head of the line: + self.targets.insert(0, uid) + self.loads.insert(0, 0) + + # initialize sets + self.completed[uid] = set() + self.failed[uid] = set() + self.pending[uid] = {} + + # rescan the graph: + self.update_graph(None) + + def _unregister_engine(self, uid): + """Existing engine with ident `uid` became unavailable.""" + if len(self.targets) == 1: + # this was our only engine + pass + + # handle any potentially finished tasks: + self.engine_stream.flush() + + # don't pop destinations, because they might be used later + # map(self.destinations.pop, self.completed.pop(uid)) + # map(self.destinations.pop, self.failed.pop(uid)) + + # prevent this engine from receiving work + idx = self.targets.index(uid) + self.targets.pop(idx) + self.loads.pop(idx) + + # wait 5 seconds before cleaning up pending jobs, since the results might + # still be incoming + if self.pending[uid]: + self.loop.add_timeout( + self.loop.time() + 5, lambda: self.handle_stranded_tasks(uid) + ) + else: + self.completed.pop(uid) + self.failed.pop(uid) + + def handle_stranded_tasks(self, engine): + """Deal with jobs resident in an engine that died.""" + lost = self.pending[engine] + for msg_id in list(lost.keys()): + if msg_id not in lost: + # prevent double-handling of messages + continue + + raw_msg = lost[msg_id].raw_msg + idents, msg = self.session.feed_identities(raw_msg, copy=False) + parent = self.session.unpack(msg[1].bytes) + idents = [engine, idents[0]] + + # build fake error reply + try: + raise error.EngineError( + "Engine %r died while running task %r" % (engine, msg_id) + ) + except: + content = error.wrap_exception() + # build fake metadata + md = dict( + status=u'error', engine=engine.decode('ascii'), date=util.utcnow() + ) + msg = self.session.msg('apply_reply', content, parent=parent, metadata=md) + raw_reply = list( + map(zmq.Message, self.session.serialize(msg, ident=idents)) + ) + # and dispatch it + self.dispatch_result(raw_reply) + + # finally scrub completed/failed lists + self.completed.pop(engine) + self.failed.pop(engine) + + # ----------------------------------------------------------------------- + # Job Submission + # ----------------------------------------------------------------------- + + @util.log_errors + def dispatch_submission(self, raw_msg): + """Dispatch job submission to appropriate handlers.""" + # ensure targets up to date: + self.notifier_stream.flush() + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + except Exception: + self.log.error("task::Invaid task msg: %r" % raw_msg, exc_info=True) + return + + # send to monitor + self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False) + + header = msg['header'] + md = msg['metadata'] + msg_id = header['msg_id'] + self.all_ids.add(msg_id) + + # get targets as a set of bytes objects + # from a list of unicode objects + targets = md.get('targets', []) + targets = set(map(cast_bytes, targets)) + + retries = md.get('retries', 0) + self.retries[msg_id] = retries + + # time dependencies + after = md.get('after', None) + if after: + after = Dependency(after) + if after.all: + if after.success: + after = Dependency( + after.difference(self.all_completed), + success=after.success, + failure=after.failure, + all=after.all, + ) + if after.failure: + after = Dependency( + after.difference(self.all_failed), + 
success=after.success, + failure=after.failure, + all=after.all, + ) + if after.check(self.all_completed, self.all_failed): + # recast as empty set, if `after` already met, + # to prevent unnecessary set comparisons + after = MET + else: + after = MET + + # location dependencies + follow = Dependency(md.get('follow', [])) + + timeout = md.get('timeout', None) + if timeout: + timeout = float(timeout) + + job = Job( + msg_id=msg_id, + raw_msg=raw_msg, + idents=idents, + msg=msg, + header=header, + targets=targets, + after=after, + follow=follow, + timeout=timeout, + metadata=md, + ) + # validate and reduce dependencies: + for dep in after, follow: + if not dep: # empty dependency + continue + # check valid: + if msg_id in dep or dep.difference(self.all_ids): + self.queue_map[msg_id] = job + return self.fail_unreachable(msg_id, error.InvalidDependency) + # check if unreachable: + if dep.unreachable(self.all_completed, self.all_failed): + self.queue_map[msg_id] = job + return self.fail_unreachable(msg_id) + + if after.check(self.all_completed, self.all_failed): + # time deps already met, try to run + if not self.maybe_run(job): + # can't run yet + if msg_id not in self.all_failed: + # could have failed as unreachable + self.save_unmet(job) + else: + self.save_unmet(job) + + def job_timeout(self, job, timeout_id): + """callback for a job's timeout. + + The job may or may not have been run at this point. + """ + if job.timeout_id != timeout_id: + # not the most recent call + return + now = time.time() + if job.timeout >= (now + 1): + self.log.warn( + "task %s timeout fired prematurely: %s > %s", + job.msg_id, + job.timeout, + now, + ) + if job.msg_id in self.queue_map: + # still waiting, but ran out of time + self.log.info("task %r timed out", job.msg_id) + self.fail_unreachable(job.msg_id, error.TaskTimeout) + + def fail_unreachable(self, msg_id, why=error.ImpossibleDependency): + """a task has become unreachable, send a reply with an ImpossibleDependency + error.""" + if msg_id not in self.queue_map: + self.log.error("task %r already failed!", msg_id) + return + job = self.queue_map.pop(msg_id) + # lazy-delete from the queue + job.removed = True + for mid in job.dependents: + if mid in self.graph: + self.graph[mid].remove(msg_id) + + try: + raise why() + except: + content = error.wrap_exception() + self.log.debug( + "task %r failing as unreachable with: %s", msg_id, content['ename'] + ) + + self.all_done.add(msg_id) + self.all_failed.add(msg_id) + + msg = self.session.send( + self.client_stream, + 'apply_reply', + content, + parent=job.header, + ident=job.idents, + ) + self.session.send(self.mon_stream, msg, ident=[b'outtask'] + job.idents) + + self.update_graph(msg_id, success=False) + + def available_engines(self): + """return a list of available engine indices based on HWM""" + if not self.hwm: + return list(range(len(self.targets))) + available = [] + for idx in range(len(self.targets)): + if self.loads[idx] < self.hwm: + available.append(idx) + return available + + def maybe_run(self, job): + """check location dependencies, and run if they are met.""" + msg_id = job.msg_id + self.log.debug("Attempting to assign task %s", msg_id) + available = self.available_engines() + if not available: + # no engines, definitely can't run + return False + + if job.follow or job.targets or job.blacklist or self.hwm: + # we need a can_run filter + def can_run(idx): + # check hwm + if self.hwm and self.loads[idx] == self.hwm: + return False + target = self.targets[idx] + # check blacklist + if target in 
job.blacklist: + return False + # check targets + if job.targets and target not in job.targets: + return False + # check follow + return job.follow.check(self.completed[target], self.failed[target]) + + indices = list(filter(can_run, available)) + + if not indices: + # couldn't run + if job.follow.all: + # check follow for impossibility + dests = set() + relevant = set() + if job.follow.success: + relevant = self.all_completed + if job.follow.failure: + relevant = relevant.union(self.all_failed) + for m in job.follow.intersection(relevant): + dests.add(self.destinations[m]) + if len(dests) > 1: + self.queue_map[msg_id] = job + self.fail_unreachable(msg_id) + return False + if job.targets: + # check blacklist+targets for impossibility + job.targets.difference_update(job.blacklist) + if not job.targets or not job.targets.intersection(self.targets): + self.queue_map[msg_id] = job + self.fail_unreachable(msg_id) + return False + return False + else: + indices = None + + self.submit_task(job, indices) + return True + + def save_unmet(self, job): + """Save a message for later submission when its dependencies are met.""" + msg_id = job.msg_id + self.log.debug("Adding task %s to the queue", msg_id) + self.queue_map[msg_id] = job + self.queue.append(job) + # track the ids in follow or after, but not those already finished + for dep_id in job.after.union(job.follow).difference(self.all_done): + if dep_id not in self.graph: + self.graph[dep_id] = set() + self.graph[dep_id].add(msg_id) + + # schedule timeout callback + if job.timeout: + timeout_id = job.timeout_id = job.timeout_id + 1 + self.loop.add_timeout( + time.time() + job.timeout, lambda: self.job_timeout(job, timeout_id) + ) + + def submit_task(self, job, indices=None): + """Submit a task to any of a subset of our targets.""" + if indices: + loads = [self.loads[i] for i in indices] + else: + loads = self.loads + idx = self.scheme(loads) + if indices: + idx = indices[idx] + target = self.targets[idx] + # print (target, map(str, msg[:3])) + # send job to the engine + self.engine_stream.send(target, flags=zmq.SNDMORE, copy=False) + self.engine_stream.send_multipart(job.raw_msg, copy=False) + # update load + self.add_job(idx) + self.pending[target][job.msg_id] = job + # notify Hub + content = dict(msg_id=job.msg_id, engine_id=target.decode('ascii')) + self.session.send( + self.mon_stream, + 'task_destination', + content=content, + ident=[b'tracktask', self.ident], + ) + + # ----------------------------------------------------------------------- + # Result Handling + # ----------------------------------------------------------------------- + + @util.log_errors + def dispatch_result(self, raw_msg): # maybe_dispatch_reults ? 
+ """dispatch method for result replies""" + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + engine = idents[0] + try: + idx = self.targets.index(engine) + except ValueError: + pass # skip load-update for dead engines + else: + self.finish_job(idx) + except Exception: + self.log.error("task::Invalid result: %r", raw_msg, exc_info=True) + return + + md = msg['metadata'] + parent = msg['parent_header'] + if md.get('dependencies_met', True): + success = md['status'] == 'ok' + msg_id = parent['msg_id'] + retries = self.retries[msg_id] + if not success and retries > 0: + # failed + self.retries[msg_id] = retries - 1 + self.handle_unmet_dependency(idents, parent) + else: + del self.retries[msg_id] + # relay to client and update graph + self.handle_result(idents, parent, raw_msg, success) + # send to Hub monitor + self.mon_stream.send_multipart([b'outtask'] + raw_msg, copy=False) + else: + self.handle_unmet_dependency(idents, parent) + + def handle_result(self, idents, parent, raw_msg, success=True): + """handle a real task result, either success or failure""" + # first, relay result to client + engine = idents[0] + client = idents[1] + # swap_ids for ROUTER-ROUTER mirror + raw_msg[:2] = [client, engine] + # print (map(str, raw_msg[:4])) + self.client_stream.send_multipart(raw_msg, copy=False) + # now, update our data structures + msg_id = parent['msg_id'] + self.pending[engine].pop(msg_id) + if success: + self.completed[engine].add(msg_id) + self.all_completed.add(msg_id) + else: + self.failed[engine].add(msg_id) + self.all_failed.add(msg_id) + self.all_done.add(msg_id) + self.destinations[msg_id] = engine + + self.update_graph(msg_id, success) + + def handle_unmet_dependency(self, idents, parent): + """handle an unmet dependency""" + engine = idents[0] + msg_id = parent['msg_id'] + + job = self.pending[engine].pop(msg_id) + job.blacklist.add(engine) + + if job.blacklist == job.targets: + self.queue_map[msg_id] = job + self.fail_unreachable(msg_id) + elif not self.maybe_run(job): + # resubmit failed + if msg_id not in self.all_failed: + # put it back in our dependency tree + self.save_unmet(job) + + if self.hwm: + try: + idx = self.targets.index(engine) + except ValueError: + pass # skip load-update for dead engines + else: + if self.loads[idx] == self.hwm - 1: + self.update_graph(None) + + def update_graph(self, dep_id=None, success=True): + """dep_id just finished. Update our dependency + graph and submit any jobs that just became runnable. + + Called with dep_id=None to update entire graph for hwm, but without finishing a task. 
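+
+        Jobs popped from the queue that neither run nor fail are restored
+        in their original order.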
+ """ + # print ("\n\n***********") + # pprint (dep_id) + # pprint (self.graph) + # pprint (self.queue_map) + # pprint (self.all_completed) + # pprint (self.all_failed) + # print ("\n\n***********\n\n") + # update any jobs that depended on the dependency + msg_ids = self.graph.pop(dep_id, []) + + # recheck *all* jobs if + # a) we have HWM and an engine just become no longer full + # or b) dep_id was given as None + + if ( + dep_id is None + or self.hwm + and any([load == self.hwm - 1 for load in self.loads]) + ): + jobs = self.queue + using_queue = True + else: + using_queue = False + jobs = deque(sorted(self.queue_map[msg_id] for msg_id in msg_ids)) + + to_restore = [] + while jobs: + job = jobs.popleft() + if job.removed: + continue + msg_id = job.msg_id + + put_it_back = True + + if job.after.unreachable( + self.all_completed, self.all_failed + ) or job.follow.unreachable(self.all_completed, self.all_failed): + self.fail_unreachable(msg_id) + put_it_back = False + + elif job.after.check( + self.all_completed, self.all_failed + ): # time deps met, maybe run + if self.maybe_run(job): + put_it_back = False + self.queue_map.pop(msg_id) + for mid in job.dependents: + if mid in self.graph: + self.graph[mid].remove(msg_id) + + # abort the loop if we just filled up all of our engines. + # avoids an O(N) operation in situation of full queue, + # where graph update is triggered as soon as an engine becomes + # non-full, and all tasks after the first are checked, + # even though they can't run. + if not self.available_engines(): + break + + if using_queue and put_it_back: + # popped a job from the queue but it neither ran nor failed, + # so we need to put it back when we are done + # make sure to_restore preserves the same ordering + to_restore.append(job) + + # put back any tasks we popped but didn't run + if using_queue: + self.queue.extendleft(to_restore) + + # ---------------------------------------------------------------------- + # methods to be overridden by subclasses + # ---------------------------------------------------------------------- + + def add_job(self, idx): + """Called after self.targets[idx] just got the job with header. + Override with subclasses. The default ordering is simple LRU. + The default loads are the number of outstanding jobs.""" + self.loads[idx] += 1 + for lis in (self.targets, self.loads): + lis.append(lis.pop(idx)) + + def finish_job(self, idx): + """Called after self.targets[idx] just finished a job. 
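+        The default implementation simply decrements that engine's load.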
+ Override with subclasses.""" + self.loads[idx] -= 1 diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index 58ecd7224..3975ce31e 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -231,7 +231,7 @@ def url(key): heart = Heart(hb_ping, hb_pong, hb_monitor , heart_id=identity) heart.start() - # create Shell Connections (MUX, Task, etc.): + # create Shell Connections (MUX, Task, etc.):# TODO: Extend for broadcast shell_addrs = url('mux'), url('task') # Use only one shell stream for mux and tasks diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index 31112d310..7e22cb249 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -52,12 +52,18 @@ def should_handle(self, stream, msg, idents): return False return True + def is_broadcast(self, parent): + return 'metadata' in parent and 'is_broadcast' in parent['metadata']\ + and parent['metadata']['is_broadcast'] + def init_metadata(self, parent): """init metadata dict, for execute/apply_reply""" return { 'started': utcnow(), 'dependencies_met' : True, 'engine' : self.ident, + 'is_broadcast': self.is_broadcast(parent) + } def finish_metadata(self, parent, metadata, reply_content): @@ -86,7 +92,6 @@ def apply_request(self, stream, ident, parent): return md = self.init_metadata(parent) - reply_content, result_buf = self.do_apply(content, bufs, msg_id, md) # put 'ok'/'error' status in header, for scheduler introspection: From 0c009bdabab2078300a0c87ef8f2f2bdbff6475c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Thu, 30 Jan 2020 13:29:46 +0100 Subject: [PATCH 06/34] BroadcastSchedulerNonCoalescing working when taskscheduler is not launched --- ipyparallel/apps/ipcontrollerapp.py | 112 ++++++++++-------- ipyparallel/client/client.py | 26 +++- ipyparallel/controller/broadcast_scheduler.py | 4 + ipyparallel/controller/hub.py | 10 +- ipyparallel/controller/scheduler.py | 10 +- ipyparallel/engine/engine.py | 7 +- ipyparallel/engine/kernel.py | 2 +- 7 files changed, 104 insertions(+), 67 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 3355d0986..e9d8b783e 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -141,11 +141,11 @@ class IPControllerApp(BaseParallelApplication): description = _description examples = _examples classes = [ProfileDir, Session, HubFactory, TaskScheduler, HeartMonitor, DictDB] + real_dbs - + # change default to True auto_create = Bool(True, config=True, help="""Whether to create profile dir if it doesn't exist.""") - + reuse_files = Bool(False, config=True, help="""Whether to reuse existing json connection files. If False, connection files will be removed on a clean exit. @@ -198,7 +198,7 @@ def _use_threads_changed(self, change): self.mq_class = 'zmq.devices.{}MonitoredQueue'.format( 'Thread' if change['new'] else 'Process' ) - + write_connection_files = Bool(True, help="""Whether to write connection files to disk. 
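
For orientation while reviewing the connection-file handling below: these
files are plain JSON in the profile's security directory. A minimal sketch
of reading one, assuming the standard ipcontroller-client.json name; the
channel keys and the URL join mirror the load_config_from_json hunks that
follow, and the example address is illustrative only.

    import json

    with open('ipcontroller-client.json') as f:
        cfg = json.load(f)

    # per-channel entries are bare ports; the client later joins them with
    # the interface into full URLs, e.g. 'tcp://127.0.0.1:55501'
    task_url = '%s:%i' % (cfg['interface'], cfg['task'])
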
True in all cases other than runs with `reuse_files=True` *after the first* @@ -207,7 +207,7 @@ def _use_threads_changed(self, change): aliases = Dict(aliases) flags = Dict(flags) - + def save_connection_dict(self, fname, cdict): """save a connection dict to json file.""" @@ -216,48 +216,48 @@ def save_connection_dict(self, fname, cdict): with open(fname, 'w') as f: f.write(json.dumps(cdict, indent=2)) os.chmod(fname, stat.S_IRUSR|stat.S_IWUSR) - + def load_config_from_json(self): """load config from existing json connector files.""" c = self.config self.log.debug("loading config from JSON") - + # load engine config - + fname = os.path.join(self.profile_dir.security_dir, self.engine_json_file) self.log.info("loading connection info from %s", fname) with open(fname) as f: ecfg = json.loads(f.read()) - + # json gives unicode, Session.key wants bytes c.Session.key = ecfg['key'].encode('ascii') - + xport,ip = ecfg['interface'].split('://') - + c.HubFactory.engine_ip = ip c.HubFactory.engine_transport = xport - + self.location = ecfg['location'] if not self.engine_ssh_server: self.engine_ssh_server = ecfg['ssh'] - + # load client config - + fname = os.path.join(self.profile_dir.security_dir, self.client_json_file) self.log.info("loading connection info from %s", fname) with open(fname) as f: ccfg = json.loads(f.read()) - + for key in ('key', 'registration', 'pack', 'unpack', 'signature_scheme'): assert ccfg[key] == ecfg[key], "mismatch between engine and client info: %r" % key - + xport, ip = ccfg['interface'].split('://') - + c.HubFactory.client_transport = xport c.HubFactory.client_ip = ip if not self.ssh_server: self.ssh_server = ccfg['ssh'] - + # load port config: c.HubFactory.regport = ecfg['registration'] c.HubFactory.hb = (ecfg['hb_ping'], ecfg['hb_pong']) @@ -266,7 +266,7 @@ def load_config_from_json(self): c.HubFactory.task = (ccfg['task'], ecfg['task']) c.HubFactory.iopub = (ccfg['iopub'], ecfg['iopub']) c.HubFactory.notifier_port = ccfg['notification'] - + def cleanup_connection_files(self): if self.reuse_files: self.log.debug("leaving JSON connection files for reuse") @@ -280,7 +280,7 @@ def cleanup_connection_files(self): self.log.error("Failed to cleanup connection file: %s", e) else: self.log.debug(u"removed %s", f) - + def load_secondary_config(self): """secondary config, loading from JSON and setting defaults""" if self.reuse_files: @@ -292,15 +292,15 @@ def load_secondary_config(self): # successfully loaded config from JSON, and reuse=True # no need to wite back the same file self.write_connection_files = False - + self.log.debug("Config changed") self.log.debug(repr(self.config)) - + def init_hub(self): c = self.config - + self.do_import_statements() - + try: self.factory = HubFactory(config=c, log=self.log) # self.start_logging() @@ -310,7 +310,7 @@ def init_hub(self): except Exception: self.log.error("Couldn't construct the Controller", exc_info=True) self.exit(1) - + if self.write_connection_files: # save to new json config files f = self.factory @@ -321,12 +321,12 @@ def init_hub(self): 'unpack' : f.session.unpacker, 'signature_scheme' : f.session.signature_scheme, } - + cdict = {'ssh' : self.ssh_server} cdict.update(f.client_info) cdict.update(base) self.save_connection_dict(self.client_json_file, cdict) - + edict = {'ssh' : self.engine_ssh_server} edict.update(f.engine_info) edict.update(base) @@ -340,10 +340,23 @@ def init_hub(self): # have the same value self.config.Session.key = self.factory.session.key + def launch_python_scheduler(self, sargs, children): + kwargs = 
dict(logname='scheduler', loglevel=self.log_level, + log_url=self.log_url, config=dict(self.config)) + if 'Process' in self.mq_class: + # run the Python scheduler in a Process + q = Process(target=launch_scheduler, args=sargs, kwargs=kwargs) + q.daemon = True + children.append(q) + else: + # single-threaded Controller + kwargs['in_thread'] = True + launch_scheduler(*sargs, **kwargs) + def init_schedulers(self): children = self.children mq = import_item(str(self.mq_class)) - + f = self.factory ident = f.session.bsession # disambiguate url, in case of * @@ -361,7 +374,7 @@ def init_schedulers(self): # Multiplexer Queue (in a Process) q = mq(zmq.ROUTER, zmq.ROUTER, zmq.PUB, b'in', b'out') - + q.bind_in(f.client_url('mux')) q.setsockopt_in(zmq.IDENTITY, b'mux_in') q.bind_out(f.engine_url('mux')) @@ -400,28 +413,29 @@ def init_schedulers(self): else: self.log.info("task::using Python %s Task scheduler"%scheme) - sargs = (BroadcastSchedulerNonCoalescing, f.client_url('task'), f.engine_url('task'), + sargs = (TaskScheduler, f.client_url('task'), f.engine_url('task'), monitor_url, disambiguate_url(f.client_url('notification')), disambiguate_url(f.client_url('registration')), ) - kwargs = dict(logname='scheduler', loglevel=self.log_level, - log_url = self.log_url, config=dict(self.config)) - if 'Process' in self.mq_class: - # run the Python scheduler in a Process - q = Process(target=launch_scheduler, args=sargs, kwargs=kwargs) - q.daemon=True - children.append(q) - else: - # single-threaded Controller - kwargs['in_thread'] = True - launch_scheduler(*sargs, **kwargs) - + self.launch_python_scheduler(sargs, children) + + sargs = ( + BroadcastSchedulerNonCoalescing, + f.client_url('broadcast_non_coalescing'), + f.engine_url('broadcast_non_coalescing'), + monitor_url, + disambiguate_url(f.client_url('notification')), + disambiguate_url(f.client_url('registration')) + ) + + self.launch_python_scheduler(sargs, children) + # set unlimited HWM for all relay devices if hasattr(zmq, 'SNDHWM'): q = children[0] q.setsockopt_in(zmq.RCVHWM, 0) q.setsockopt_out(zmq.SNDHWM, 0) - + for q in children[1:]: if not hasattr(q, 'setsockopt_in'): continue @@ -430,7 +444,7 @@ def init_schedulers(self): q.setsockopt_out(zmq.SNDHWM, 0) q.setsockopt_out(zmq.RCVHWM, 0) q.setsockopt_mon(zmq.SNDHWM, 0) - + def terminate_children(self): child_procs = [] @@ -447,12 +461,12 @@ def terminate_children(self): except OSError: # already dead pass - + def handle_signal(self, sig, frame): self.log.critical("Received signal %i, shutting down", sig) self.terminate_children() self.loop.stop() - + def init_signal(self): for sig in (SIGINT, SIGABRT, SIGTERM): signal(sig, self.handle_signal) @@ -476,7 +490,7 @@ def forward_logging(self): handler.root_topic = 'controller' handler.setLevel(self.log_level) self.log.addHandler(handler) - + @catch_config_error def initialize(self, argv=None): super(IPControllerApp, self).initialize(argv) @@ -484,7 +498,7 @@ def initialize(self, argv=None): self.load_secondary_config() self.init_hub() self.init_schedulers() - + def start(self): # Start the subprocesses: self.factory.start() @@ -502,7 +516,7 @@ def start(self): self.log.critical("Interrupted, Exiting...\n") finally: self.cleanup_connection_files() - + def launch_new_instance(*args, **kwargs): """Create and run the IPython controller""" @@ -510,7 +524,7 @@ def launch_new_instance(*args, **kwargs): # make sure we don't get called from a multiprocessing subprocess # this can result in infinite Controllers being started on Windows # which doesn't have a 
proper fork, so multiprocessing is wonky - + # this only comes up when IPython has been installed using vanilla # setuptools, and *not* distribute. import multiprocessing diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 33455c1e9..fd1802d44 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -65,9 +65,6 @@ def unpack_message(f, self, msg_parts): idents, msg = self.session.feed_identities(msg_parts, copy=False) try: msg = self.session.deserialize(msg, content=True, copy=False) - if 'is_broadcast' in msg['metadata'] and msg['metadata']['is_broadcast']: - msg['parent_header']['msg_id'] =\ - f'{msg["parent_header"]["msg_id"]}_{msg["metadata"]["engine"]}' except: self.log.error("Invalid Message", exc_info=True) else: @@ -362,6 +359,7 @@ def _profile_default(self): _notification_socket=Instance('zmq.Socket', allow_none=True) _mux_socket=Instance('zmq.Socket', allow_none=True) _task_socket=Instance('zmq.Socket', allow_none=True) + _broadcast_non_coalescing_socket=Instance('zmq.Socket', allow_none=True) _task_scheme=Unicode() _closed = False @@ -443,7 +441,7 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No cfg['interface'] = "%s://%s" % (proto, addr) # turn interface,port into full urls: - for key in ('control', 'task', 'mux', 'iopub', 'notification', 'registration'): + for key in ('control', 'task', 'mux', 'iopub', 'notification', 'registration', 'broadcast_non_coalescing'): cfg[key] = cfg['interface'] + ':%i' % cfg[key] url = cfg['registration'] @@ -652,6 +650,9 @@ def connect_socket(s, url): self._task_socket = self._context.socket(zmq.DEALER) connect_socket(self._task_socket, cfg['task']) + self._broadcast_non_coalescing_socket = self._context.socket(zmq.DEALER) + connect_socket(self._broadcast_non_coalescing_socket, cfg['broadcast_non_coalescing']) + self._notification_socket = self._context.socket(zmq.SUB) self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'') connect_socket(self._notification_socket, cfg['notification']) @@ -884,6 +885,8 @@ def _setup_streams(self): self._iopub_stream.on_recv(self._dispatch_iopub, copy=False) self._notification_stream = ZMQStream(self._notification_socket, self._io_loop) self._notification_stream.on_recv(self._dispatch_notification, copy=False) + self._broadcast_non_coalescing_stream = ZMQStream(self._broadcast_non_coalescing_socket, self._io_loop) + self._broadcast_non_coalescing_stream.on_recv(self._dispatch_broadcast_reply, copy=False) def _start_io_thread(self): """Start IOLoop in a background thread.""" @@ -939,6 +942,19 @@ def _dispatch_reply(self, msg): else: handler(msg) + @unpack_message + def _dispatch_broadcast_reply(self, msg): + if 'is_broadcast' in msg['metadata'] and msg['metadata']['is_broadcast']: + msg['parent_header']['msg_id'] =\ + f'{msg["parent_header"]["msg_id"]}_{msg["metadata"]["engine"]}' + + msg_type = msg['header']['msg_type'] + handler = self._queue_handlers.get(msg_type, None) + if handler is None: + raise KeyError(f'Unhandled reply message type: {msg_type}') + else: + handler(msg) + @unpack_message def _dispatch_iopub(self, msg): """handler for IOPub messages""" @@ -1597,7 +1613,7 @@ def broadcast_view(self, targets='all', **kwargs): targets = self._build_targets(targets)[1] return BroadcastView( - client=self, socket=self._task_stream, targets=targets, **kwargs + client=self, socket=self._broadcast_non_coalescing_stream, targets=targets, **kwargs ) #-------------------------------------------------------------------------- diff 
--git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 1cae2a38f..cbeb6ee52 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -6,8 +6,12 @@ class BroadcastSchedulerNonCoalescing(Scheduler): + def __init__(self, *args, **kwargs): + super().__init__(**kwargs) + self.log.info('Broadcast Scheduler Started') @util.log_errors def dispatch_submission(self, raw_msg): + try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 6acbf5ca5..c8801bb39 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -149,6 +149,12 @@ def _mux_default(self): def _task_default(self): return tuple(util.select_random_ports(2)) + broadcast_non_coalescing = Tuple(Integer(), Integer(), config=True, + help="""Client/Engine Port pair for BroadcastNonCoalescing queue""") + + def _broadcast_non_coalescing_default(self): + return tuple(util.select_random_ports(2)) + control = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for Control queue""") @@ -274,7 +280,7 @@ def init_hub(self): scheme = TaskScheduler.scheme_name.default_value # build connection dicts - engine = self.engine_info = { + engine = self.engine_info = { # TODO: Add broadcast 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip), 'registration' : self.regport, 'control' : self.control[1], @@ -283,6 +289,7 @@ def init_hub(self): 'hb_pong' : self.hb[1], 'task' : self.task[1], 'iopub' : self.iopub[1], + 'broadcast_non_coalescing': self.broadcast_non_coalescing[1] } client = self.client_info = { @@ -294,6 +301,7 @@ def init_hub(self): 'task_scheme' : scheme, 'iopub' : self.iopub[0], 'notification' : self.notifier_port, + 'broadcast_non_coalescing': self.broadcast_non_coalescing[0] } self.log.debug("Hub engine addrs: %s", self.engine_info) diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 578abfed6..6f2e68cb7 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -113,6 +113,8 @@ def launch_scheduler( # for safety with multiprocessing ctx = zmq.Context() loop = ioloop.IOLoop() + + ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(ins, 0) ins.setsockopt(zmq.IDENTITY, identity + b'_in') @@ -153,14 +155,6 @@ def launch_scheduler( log=log, config=config, ) - - # scheduler = TaskScheduler(client_stream=ins, engine_stream=outs, - # mon_stream=mons, notifier_stream=nots, - # query_stream=querys, - # loop=loop, log=log, - # config=config) - - # TODO: How to start broadcastScheduler? 
scheduler.start() if not in_thread: try: diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index 3975ce31e..f0d6401f3 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -231,14 +231,15 @@ def url(key): heart = Heart(hb_ping, hb_pong, hb_monitor , heart_id=identity) heart.start() - # create Shell Connections (MUX, Task, etc.):# TODO: Extend for broadcast - shell_addrs = url('mux'), url('task') + # create Shell Connections (MUX, Task, etc.): + shell_addrs = url('mux'), url('task'), url('broadcast_non_coalescing') # Use only one shell stream for mux and tasks stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop) stream.setsockopt(zmq.IDENTITY, identity) + shell_streams = [stream] - for addr in shell_addrs: + for addr in shell_addrs: # TODO: Possibly problematic connect(stream, addr) # control stream: diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index 7e22cb249..021a647bb 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -100,7 +100,7 @@ def apply_request(self, stream, ident, parent): # flush i/o sys.stdout.flush() sys.stderr.flush() - + self.log.info('ENGINE SENDING APPLY_REPLY', md) self.session.send(stream, u'apply_reply', reply_content, parent=parent, ident=ident, buffers=result_buf, metadata=md) From daed2e2505f12f585f6cb4380ca14a2e1179f9f2 Mon Sep 17 00:00:00 2001 From: Min RK Date: Thu, 30 Jan 2020 15:33:34 +0100 Subject: [PATCH 07/34] make scheduler_args keyword-only - ensure broadcast scheduler has an identity set - make default no setting of identity (avoids collision when unset) --- ipyparallel/apps/ipcontrollerapp.py | 49 ++++++++++++++++++----------- ipyparallel/controller/scheduler.py | 10 +++--- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index e9d8b783e..58cec5930 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -340,18 +340,25 @@ def init_hub(self): # have the same value self.config.Session.key = self.factory.session.key - def launch_python_scheduler(self, sargs, children): - kwargs = dict(logname='scheduler', loglevel=self.log_level, - log_url=self.log_url, config=dict(self.config)) + def launch_python_scheduler(self, scheduler_args, children): + kwargs = {} + kwargs.update(scheduler_args) + kwargs.update( + dict( + logname='scheduler', + loglevel=self.log_level, + log_url=self.log_url,config=dict(self.config), + ) + ) if 'Process' in self.mq_class: # run the Python scheduler in a Process - q = Process(target=launch_scheduler, args=sargs, kwargs=kwargs) + q = Process(target=launch_scheduler, kwargs=kwargs) q.daemon = True children.append(q) else: # single-threaded Controller kwargs['in_thread'] = True - launch_scheduler(*sargs, **kwargs) + launch_scheduler(**kwargs) def init_schedulers(self): children = self.children @@ -413,22 +420,28 @@ def init_schedulers(self): else: self.log.info("task::using Python %s Task scheduler"%scheme) - sargs = (TaskScheduler, f.client_url('task'), f.engine_url('task'), - monitor_url, disambiguate_url(f.client_url('notification')), - disambiguate_url(f.client_url('registration')), + scheduler_args = dict( + scheduler_class=TaskScheduler, + in_addr=f.client_url('task'), + out_addr=f.engine_url('task'), + mon_addr=monitor_url, + not_addr=disambiguate_url(f.client_url('notification')), + reg_addr=disambiguate_url(f.client_url('registration')), + identity=b'task', ) - 
self.launch_python_scheduler(sargs, children) - - sargs = ( - BroadcastSchedulerNonCoalescing, - f.client_url('broadcast_non_coalescing'), - f.engine_url('broadcast_non_coalescing'), - monitor_url, - disambiguate_url(f.client_url('notification')), - disambiguate_url(f.client_url('registration')) + self.launch_python_scheduler(scheduler_args, children) + + scheduler_args = dict( + scheduler_class=BroadcastSchedulerNonCoalescing, + in_addr=f.client_url('broadcast_non_coalescing'), + out_addr=f.engine_url('broadcast_non_coalescing'), + mon_addr=monitor_url, + not_addr=disambiguate_url(f.client_url('notification')), + reg_addr=disambiguate_url(f.client_url('registration')), + identity=b'broadcast_non_coalescing', ) - self.launch_python_scheduler(sargs, children) + self.launch_python_scheduler(scheduler_args, children) # set unlimited HWM for all relay devices if hasattr(zmq, 'SNDHWM'): diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 6f2e68cb7..2c8af0b5d 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -94,7 +94,7 @@ def launch_scheduler( logname='root', log_url=None, loglevel=logging.DEBUG, - identity=b'task', + identity=None, in_thread=False, ): @@ -117,12 +117,14 @@ def launch_scheduler( ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(ins, 0) - ins.setsockopt(zmq.IDENTITY, identity + b'_in') + if identity: + ins.setsockopt(zmq.IDENTITY, identity + b'_in') ins.bind(in_addr) outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(outs, 0) - outs.setsockopt(zmq.IDENTITY, identity + b'_out') + if identity: + outs.setsockopt(zmq.IDENTITY, identity + b'_out') outs.bind(out_addr) mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop) util.set_hwm(mons, 0) @@ -160,4 +162,4 @@ def launch_scheduler( try: loop.start() except KeyboardInterrupt: - scheduler.log.critical("Interrupted, exiting...") \ No newline at end of file + scheduler.log.critical("Interrupted, exiting...") From 9eb2a3691f86668f2fd941916ddc54035bd0fb0a Mon Sep 17 00:00:00 2001 From: Min RK Date: Thu, 30 Jan 2020 15:34:16 +0100 Subject: [PATCH 08/34] more engine logging --- ipyparallel/engine/engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index f0d6401f3..6201a2864 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -237,9 +237,11 @@ def url(key): # Use only one shell stream for mux and tasks stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop) stream.setsockopt(zmq.IDENTITY, identity) + self.log.debug("Setting shell identity %r", identity) shell_streams = [stream] for addr in shell_addrs: # TODO: Possibly problematic + self.log.info("Connecting shell to %s", addr) connect(stream, addr) # control stream: From bab6eaaaac38d840020593ec311954afba1e5986 Mon Sep 17 00:00:00 2001 From: Min RK Date: Thu, 30 Jan 2020 15:35:01 +0100 Subject: [PATCH 09/34] make sure routing identities are correct in broadcast scheduler --- ipyparallel/client/view.py | 10 ++++++---- ipyparallel/controller/broadcast_scheduler.py | 15 +++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 84c85e5d2..64950b5cd 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -860,16 +860,18 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe pargs = [PrePickled(arg) for arg in args] pkwargs = {k: PrePickled(v) for k, v in 
kwargs.items()}
 
-        metadata = dict(targets=idents, is_broadcast=True)
+        s_idents = [ident.decode("utf8") for ident in idents]
+
+        metadata = dict(targets=s_idents, is_broadcast=True)
 
         original_future = self.client.send_apply_request(
             self._socket, pf, pargs, pkwargs,
-            track=track, ident=idents, metadata=metadata)
+            track=track, metadata=metadata)
         original_msg_id = original_future.msg_id
 
-        for ident in idents:
-            msg_and_target_id = f'{original_msg_id}_{ident.decode("utf-8")}'
+        for ident in s_idents:
+            msg_and_target_id = f'{original_msg_id}_{ident}'
             future = self.client.create_message_futures(msg_and_target_id, async_result=True, track=True)
             self.client.outstanding.add(msg_and_target_id)
             self.outstanding.add(msg_and_target_id)
             futures.append(future[0])
diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py
index cbeb6ee52..f05089883 100644
--- a/ipyparallel/controller/broadcast_scheduler.py
+++ b/ipyparallel/controller/broadcast_scheduler.py
@@ -22,7 +22,7 @@ def dispatch_submission(self, raw_msg):
             return
 
         # send to monitor
-        self.mon_stream.send_multipart([b'intask'] + raw_msg, copy=False)
+        self.mon_stream.send_multipart([b'inbcast'] + raw_msg, copy=False)
 
         header = msg['header']
         metadata = msg['metadata']
@@ -33,11 +33,14 @@ def dispatch_submission(self, raw_msg):
         for target in targets:
             msg_and_target_id = f'{original_msg_id}_{target}'
             self.all_ids.add(msg_and_target_id)
+            new_idents = [cast_bytes(target)] + idents
+
             header['msg_id'] = msg_and_target_id
-            raw_msg[1] = self.session.pack(header)
-            #TODO: Might have to change raw_msg to add new msg_id
-            self.engine_stream.send(cast_bytes(target), flags=zmq.SNDMORE, copy=False)
-            self.engine_stream.send_multipart(raw_msg, copy=False)
+            new_msg_list = self.session.serialize(msg, ident=new_idents)
+            new_msg_list.extend(msg['buffers'])
+
+            self.log.debug("Sending %r", new_msg_list)
+            self.engine_stream.send_multipart(new_msg_list, copy=False)
 
     @util.log_errors
     def dispatch_result(self, raw_msg):
@@ -68,4 +71,4 @@ def dispatch_result(self, raw_msg):
         self.all_done.add(msg_and_target_id)
 
         # send to Hub monitor TODO:Figure out if this is needed
-        self.mon_stream.send_multipart([b'outtask'] + raw_msg, copy=False)
+        self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False)

From 4a78ea39394fe10abc471e3340e71606735c7e98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Fri, 31 Jan 2020 15:45:03 +0100
Subject: [PATCH 10/34] hub now recognises broadcast messages

---
 ipyparallel/controller/hub.py | 72 +++++++++++++++++++++++++++++++++++
 ipyparallel/engine/kernel.py  |  2 +-
 2 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py
index c8801bb39..bd718b54d 100644
--- a/ipyparallel/controller/hub.py
+++ b/ipyparallel/controller/hub.py
@@ -443,6 +443,8 @@ def __init__(self, **kwargs):
             b'out': self.save_queue_result,
             b'intask': self.save_task_request,
             b'outtask': self.save_task_result,
+            b'inbcast': self.save_broadcast_request,
+            b'outbcast': self.save_broadcast_result,
             b'tracktask': self.save_task_destination,
             b'incontrol': _passer,
             b'outcontrol': _passer,
@@ -721,6 +723,76 @@ def save_queue_result(self, idents, msg):
             self.log.error("DB Error updating record %r", msg_id, exc_info=True)
 
 
+    #--------------------- Broadcast traffic ------------------------------
+    def save_broadcast_request(self, idents, msg):
+        client_id = idents[0]
+
+        try:
+            msg = self.session.deserialize(msg)
+        except Exception as e:
+            self.log.error(f'broadcast:: client {client_id} sent invalid broadcast message:'
+                           f' {msg}', exc_info=True)
+            return
+
+        record = init_record(msg)
+
+        record['client_uuid'] = msg['header']['session']
+        header = msg['header']
+        msg_id = header['msg_id']
+        self.pending.add(msg_id)
+
+        try:
+            self.db.add_record(msg_id, record)
+        except Exception as e:
+            self.log.error(f'DB Error adding record {msg_id}', exc_info=True)
+
+    def save_broadcast_result(self, idents, msg):
+        client_id = idents[0]
+        try:
+            msg = self.session.deserialize(msg)
+        except Exception as e:
+            self.log.error(f'broadcast::invalid broadcast result message sent to {client_id}:'
+                           f' {msg}', exc_info=True)
+            return
+
+        # save the result of a completed broadcast
+        parent = msg['parent_header']
+        if not parent:
+            self.log.warn(f'Broadcast message {msg} had no parent')
+            return
+        msg_id = parent['msg_id']
+        header = msg['header']
+        md = msg['metadata']
+        engine_uuid = md.get('engine', u'')
+        eid = self.by_ident.get(cast_bytes(engine_uuid), None)
+        status = md.get('status', None)
+
+        if msg_id in self.pending:
+            self.log.info(f'broadcast:: broadcast {msg_id} finished on {eid}')
+            self.pending.remove(msg_id)
+            self.all_completed.add(msg_id)
+            if eid is not None and status != 'aborted':
+                self.completed[eid].append(msg_id)
+            ensure_date_is_parsed(header)
+            completed = util.ensure_timezone(header['date'])
+            started = extract_dates(md.get('started', None))
+            result = {
+                'result_header': header,
+                'result_metadata': msg['metadata'],
+                'result_content': msg['content'],
+                'started': started,
+                'completed': completed,
+                'received': util.utcnow(),
+                'engine_uuid': engine_uuid,
+                'result_buffers': msg['buffers']
+            }
+
+            try:
+                self.db.update_record(msg_id, result)
+            except Exception as e:
+                self.log.error(f'DB Error saving broadcast result {msg_id}', exc_info=True)
+        else:
+            self.log.debug(f'broadcast::unknown broadcast {msg_id} finished')
+
     #--------------------- Task Queue Traffic ------------------------------
 
     def save_task_request(self, idents, msg):
diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py
index 021a647bb..9e01dd365 100644
--- a/ipyparallel/engine/kernel.py
+++ b/ipyparallel/engine/kernel.py
@@ -100,7 +100,7 @@ def apply_request(self, stream, ident, parent):
         # flush i/o
         sys.stdout.flush()
         sys.stderr.flush()
-        self.log.info('ENGINE SENDING APPLY_REPLY', md)
+        self.log.debug('engine: sending apply_reply')
         self.session.send(stream, u'apply_reply', reply_content, parent=parent,
                           ident=ident, buffers=result_buf, metadata=md)

From 6b65d54283ce02821f22307599b472d05a6f6eb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Fri, 31 Jan 2020 15:45:45 +0100
Subject: [PATCH 11/34] using dispatch_reply also for broadcast - works now
 because msg_id of msg is correctly replaced in scheduler

---
 ipyparallel/client/client.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py
index fd1802d44..8e54b95b4 100644
--- a/ipyparallel/client/client.py
+++ b/ipyparallel/client/client.py
@@ -886,7 +886,7 @@ def _setup_streams(self):
         self._notification_stream = ZMQStream(self._notification_socket, self._io_loop)
         self._notification_stream.on_recv(self._dispatch_notification, copy=False)
         self._broadcast_non_coalescing_stream = ZMQStream(self._broadcast_non_coalescing_socket, self._io_loop)
-        self._broadcast_non_coalescing_stream.on_recv(self._dispatch_broadcast_reply, copy=False)
+        self._broadcast_non_coalescing_stream.on_recv(self._dispatch_reply, copy=False)
 
     def
_start_io_thread(self): """Start IOLoop in a background thread.""" @@ -942,19 +942,6 @@ def _dispatch_reply(self, msg): else: handler(msg) - @unpack_message - def _dispatch_broadcast_reply(self, msg): - if 'is_broadcast' in msg['metadata'] and msg['metadata']['is_broadcast']: - msg['parent_header']['msg_id'] =\ - f'{msg["parent_header"]["msg_id"]}_{msg["metadata"]["engine"]}' - - msg_type = msg['header']['msg_type'] - handler = self._queue_handlers.get(msg_type, None) - if handler is None: - raise KeyError(f'Unhandled reply message type: {msg_type}') - else: - handler(msg) - @unpack_message def _dispatch_iopub(self, msg): """handler for IOPub messages""" From a25eab71101787069598bdff25e03a756f2a014a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 31 Jan 2020 15:46:27 +0100 Subject: [PATCH 12/34] sending msg to monitor for each msg created in the scheduler --- ipyparallel/controller/broadcast_scheduler.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index f05089883..90b471b45 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -22,7 +22,6 @@ def dispatch_submission(self, raw_msg): return # send to monitor - self.mon_stream.send_multipart([b'inbcast'] + raw_msg, copy=False) header = msg['header'] metadata = msg['metadata'] @@ -39,6 +38,7 @@ def dispatch_submission(self, raw_msg): new_msg_list = self.session.serialize(msg, ident=new_idents) new_msg_list.extend(msg['buffers']) + self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) self.log.debug("Sending %r", new_msg_list) self.engine_stream.send_multipart(new_msg_list, copy=False) @@ -47,7 +47,7 @@ def dispatch_result(self, raw_msg): try: idents, msg = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg, content=False, copy=False) - engine, client = idents[:2] # TODO: Make sure this is actually engine + engine, client = idents[:2] except Exception as e: self.log.error( f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True @@ -55,20 +55,17 @@ def dispatch_result(self, raw_msg): return metadata = msg['metadata'] - parent = msg['parent_header'] + msg_id = msg['parent_header']['msg_id'] - original_msg_id = parent['msg_id'] - msg_and_target_id = f'{original_msg_id}_{engine.decode("utf-8")}' success = metadata['status'] == 'ok' if success: - self.all_completed.add(msg_and_target_id) + self.all_completed.add(msg_id) else: - self.all_failed.add(msg_and_target_id) + self.all_failed.add(msg_id) # swap ids for ROUTER-ROUTER mirror raw_msg[:2] = [client, engine] self.client_stream.send_multipart(raw_msg, copy=False) - self.all_done.add(msg_and_target_id) + self.all_done.add(msg_id) - # send to Hub monitor TODO:Figure out if this is needed self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) From f0fafd1508d46c2d60b3fc3935e7e9ef7d30fb2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Mon, 24 Feb 2020 19:43:11 +0100 Subject: [PATCH 13/34] Coalescing Broadcast scheduler --- ipyparallel/apps/ipcontrollerapp.py | 15 +++- ipyparallel/client/client.py | 59 ++++++++++--- ipyparallel/client/view.py | 46 +++++++++- ipyparallel/controller/broadcast_scheduler.py | 85 ++++++++++++++++++- ipyparallel/controller/hub.py | 16 +++- ipyparallel/engine/engine.py | 2 +- ipyparallel/engine/kernel.py | 10 +-- ipyparallel/serialize/serialize.py | 12 ++- 8 files 
changed, 217 insertions(+), 28 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 58cec5930..ecc1ae6db 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -37,7 +37,8 @@ Session, session_aliases, session_flags, ) -from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing +from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing, \ + BroadcastSchedulerCoalescing from ipyparallel.controller.heartmonitor import HeartMonitor from ipyparallel.controller.hub import HubFactory from ipyparallel.controller.scheduler import launch_scheduler @@ -443,6 +444,18 @@ def init_schedulers(self): self.launch_python_scheduler(scheduler_args, children) + scheduler_args = dict( + scheduler_class=BroadcastSchedulerCoalescing, + in_addr=f.client_url('broadcast_coalescing'), + out_addr=f.engine_url('broadcast_coalescing'), + mon_addr=monitor_url, + not_addr=disambiguate_url(f.client_url('notification')), + reg_addr=disambiguate_url(f.client_url('registration')), + identity=b'broadcast_coalescing', + ) + + self.launch_python_scheduler(scheduler_args, children) + # set unlimited HWM for all relay devices if hasattr(zmq, 'SNDHWM'): q = children[0] diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 8e54b95b4..ed3aef016 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -51,7 +51,7 @@ from ..util import ioloop from .asyncresult import AsyncResult, AsyncHubResult from .futures import MessageFuture, multi_future -from .view import DirectView, LoadBalancedView, BroadcastView +from .view import DirectView, LoadBalancedView, BroadcastViewNonCoalescing, BroadcastViewCoalescing import jupyter_client.session jupyter_client.session.extract_dates = lambda obj: obj # -------------------------------------------------------------------------- @@ -360,6 +360,8 @@ def _profile_default(self): _mux_socket=Instance('zmq.Socket', allow_none=True) _task_socket=Instance('zmq.Socket', allow_none=True) _broadcast_non_coalescing_socket=Instance('zmq.Socket', allow_none=True) + _broadcast_coalescing_socket=Instance('zmq.Socket', allow_none=True) + _task_scheme=Unicode() _closed = False @@ -441,7 +443,15 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No cfg['interface'] = "%s://%s" % (proto, addr) # turn interface,port into full urls: - for key in ('control', 'task', 'mux', 'iopub', 'notification', 'registration', 'broadcast_non_coalescing'): + for key in ( + 'control', + 'task', + 'mux', + 'iopub', + 'notification', + 'registration', + 'broadcast_non_coalescing', + 'broadcast_coalescing'): cfg[key] = cfg['interface'] + ':%i' % cfg[key] url = cfg['registration'] @@ -651,7 +661,12 @@ def connect_socket(s, url): connect_socket(self._task_socket, cfg['task']) self._broadcast_non_coalescing_socket = self._context.socket(zmq.DEALER) - connect_socket(self._broadcast_non_coalescing_socket, cfg['broadcast_non_coalescing']) + connect_socket(self._broadcast_non_coalescing_socket, + cfg['broadcast_non_coalescing']) + + self._broadcast_coalescing_socket = self._context.socket(zmq.DEALER) + connect_socket(self._broadcast_coalescing_socket, + cfg['broadcast_coalescing']) self._notification_socket = self._context.socket(zmq.SUB) self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'') @@ -697,11 +712,12 @@ def _extract_metadata(self, msg): 'follow' : msg_meta.get('follow', []), 'after' : msg_meta.get('after', []), 
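
The broadcast channels added here follow the same wiring pattern as the existing mux and task channels: the client holds a plain zmq DEALER connected to a scheduler-side ROUTER, so replies can be routed back by the sender's identity. A minimal, self-contained sketch of that pairing (the inproc address and the frame contents are illustrative, not ipyparallel's actual protocol):

    import zmq

    ctx = zmq.Context.instance()
    addr = 'inproc://broadcast_demo'  # illustrative address

    router = ctx.socket(zmq.ROUTER)   # scheduler side
    router.bind(addr)

    dealer = ctx.socket(zmq.DEALER)   # client side
    dealer.connect(addr)

    dealer.send_multipart([b'apply_request'])
    ident, frame = router.recv_multipart()          # ROUTER prepends the sender identity
    router.send_multipart([ident, b'apply_reply'])  # the reply is routed back by that identity
    print(dealer.recv_multipart())                  # [b'apply_reply']
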
              'status' : content['status'],
+              'is_broadcast_non_coalescing': msg_meta.get('is_broadcast_non_coalescing', False),
+              'is_broadcast_coalescing': msg_meta.get('is_broadcast_coalescing', False)
           }
 
         if md['engine_uuid'] is not None:
             md['engine_id'] = self._engines.get(md['engine_uuid'], None)
-
         if 'date' in parent:
             md['submitted'] = parent['date']
         if 'started' in msg_meta:
@@ -807,7 +823,13 @@ def _handle_execute_reply(self, msg):
 
     def _handle_apply_reply(self, msg):
         """Save the reply to an apply_request into our results."""
         parent = msg['parent_header']
-        msg_id = parent['msg_id']
+
+        md = msg['metadata']
+        if md.get('is_broadcast_non_coalescing', False) or md.get('is_broadcast_coalescing'):
+            msg_id = msg['metadata']['original_msg_id']
+        else:
+            msg_id = parent['msg_id']
+
         future = self._futures.get(msg_id, None)
         if msg_id not in self.outstanding:
             if msg_id in self.history:
@@ -829,9 +851,15 @@ def _handle_apply_reply(self, msg):
             if msg_id in e_outstanding:
                 e_outstanding.remove(msg_id)
 
+
         # construct result:
         if content['status'] == 'ok':
-            self.results[msg_id] = serialize.deserialize_object(msg['buffers'])[0]
+            if md.get('is_broadcast_coalescing', False):
+                self.results[msg_id] = serialize.deserialize_object(
+                    msg['buffers'], try_to_extract_all=True
+                )
+            else:
+                self.results[msg_id] = serialize.deserialize_object(msg['buffers'])[0]
         elif content['status'] == 'aborted':
             self.results[msg_id] = error.TaskAborted(msg_id)
             out_future = self._output_futures.get(msg_id)
@@ -885,8 +913,13 @@ def _setup_streams(self):
         self._iopub_stream.on_recv(self._dispatch_iopub, copy=False)
         self._notification_stream = ZMQStream(self._notification_socket, self._io_loop)
         self._notification_stream.on_recv(self._dispatch_notification, copy=False)
-        self._broadcast_non_coalescing_stream = ZMQStream(self._broadcast_non_coalescing_socket, self._io_loop)
+
+        self._broadcast_non_coalescing_stream = ZMQStream(
+            self._broadcast_non_coalescing_socket, self._io_loop)
         self._broadcast_non_coalescing_stream.on_recv(self._dispatch_reply, copy=False)
+        self._broadcast_coalescing_stream = ZMQStream(
+            self._broadcast_coalescing_socket, self._io_loop)
+        self._broadcast_coalescing_stream.on_recv(self._dispatch_reply, copy=False)
 
     def _start_io_thread(self):
         """Start IOLoop in a background thread."""
@@ -1585,7 +1618,7 @@ def direct_view(self, targets='all', **kwargs):
             client=self, socket=self._mux_stream, targets=targets, **kwargs
         )
 
-    def broadcast_view(self, targets='all', **kwargs):
+    def broadcast_view(self, targets='all', is_coalescing=False, **kwargs):
         """construct a BroadCastView object.
 
         If no arguments are specified, create a BroadCastView using all engines.
 
         Parameters
         ----------
 
         targets: list,slice,int,etc. [default: use all engines]
             The subset of engines across which to load-balance execution
+        is_coalescing: scheduler collects all messages from engines and returns them as one
 
         kwargs: passed to BroadCastView
         """
         targets = self._build_targets(targets)[1]
-        return BroadcastView(
-            client=self, socket=self._broadcast_non_coalescing_stream, targets=targets, **kwargs
-        )
+        return BroadcastViewCoalescing(
+            client=self, socket=self._broadcast_coalescing_stream, targets=targets, **kwargs
+        ) if is_coalescing else BroadcastViewNonCoalescing(
+            client=self,
+            socket=self._broadcast_non_coalescing_stream,
+            targets=targets, **kwargs)
 
     #--------------------------------------------------------------------------
     # Query methods
     #--------------------------------------------------------------------------
 
diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py
index 64950b5cd..0d35f54db 100644
--- a/ipyparallel/client/view.py
+++ b/ipyparallel/client/view.py
@@ -839,7 +839,7 @@ def activate(self, suffix=''):
         ip.magics_manager.register(M)
 
 
-class BroadcastView(DirectView):
+class BroadcastViewNonCoalescing(DirectView):
     def __init__(self, client=None, socket=None, targets=None):
         super().__init__(client=client, socket=socket, targets=targets)
 
@@ -876,6 +876,7 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe
             self.client.outstanding.add(msg_and_target_id)
             self.outstanding.add(msg_and_target_id)
             futures.append(future[0])
+        self.outstanding.remove(original_msg_id)
 
         if isinstance(targets, int):
             futures = futures[0]
@@ -892,6 +893,46 @@ def map(self, f, *sequences, **kwargs):
         pass
 
 
+class BroadcastViewCoalescing(DirectView):
+    def __init__(self, client=None, socket=None, targets=None):
+        super().__init__(client=client, socket=socket, targets=targets)
+
+    @sync_results
+    @save_ids
+    def _really_apply(self, f, args=None, kwargs=None, block=None, track=None,
+                      targets=None):
+        args = [] if args is None else args
+        kwargs = {} if kwargs is None else kwargs
+        block = self.block if block is None else block
+        track = self.track if track is None else track
+        targets = self.targets if targets is None else targets
+
+        idents, _targets = self.client._build_targets(targets)
+
+        pf = PrePickled(f)
+        pargs = [PrePickled(arg) for arg in args]
+        pkwargs = {k: PrePickled(v) for k, v in kwargs.items()}
+
+        s_idents = [ident.decode("utf8") for ident in idents]
+
+        metadata = dict(targets=s_idents, is_broadcast_coalescing=True)
+
+        message_future = self.client.send_apply_request(
+            self._socket, pf, pargs, pkwargs,
+            track=track, metadata=metadata)
+
+        self.client.outstanding.add(message_future.msg_id)
+
+        ar = AsyncResult(self.client, message_future, fname=getname(f), targets=_targets,
+                         owner=True)
+        if block:
+            try:
+                return ar.get()
+            except KeyboardInterrupt:
+                pass
+        return ar
+
+
 class LoadBalancedView(View):
     """A load-balancing View that only executes via the Task scheduler.
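
With both view classes in place, the coalescing behaviour is chosen per view at construction time: the non-coalescing view yields one future per engine, while the coalescing view hands back a single AsyncResult whose value bundles every engine's reply. A usage sketch against the API added in this patch (it assumes a controller and engines are already running on the default profile):

    import ipyparallel as ipp

    rc = ipp.Client()  # assumes a running cluster

    # one reply per engine, delivered as separate futures
    non_coalescing = rc.broadcast_view(is_coalescing=False)

    # one combined reply, reassembled by the coalescing scheduler
    coalescing = rc.broadcast_view(is_coalescing=True)

    ar = coalescing.apply_async(lambda: 42)
    print(ar.get())  # the engines' replies, coalesced into one result
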
@@ -1219,5 +1260,6 @@ def shutdown(self, wait=True): if wait: self.view.wait() -__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', 'BroadcastView'] +__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', + 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 90b471b45..92e711b76 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -1,4 +1,3 @@ -import zmq from ipython_genutils.py3compat import cast_bytes from ipyparallel import util @@ -8,7 +7,8 @@ class BroadcastSchedulerNonCoalescing(Scheduler): def __init__(self, *args, **kwargs): super().__init__(**kwargs) - self.log.info('Broadcast Scheduler Started') + self.log.info('Broadcast non coalescing Scheduler Started') + @util.log_errors def dispatch_submission(self, raw_msg): @@ -69,3 +69,84 @@ def dispatch_result(self, raw_msg): self.all_done.add(msg_id) self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) + + +class BroadcastSchedulerCoalescing(Scheduler): + def __init__(self, *args, **kwargs): + super().__init__(**kwargs) + self.log.info('Broadcast coalescing Scheduler Started') + + accumulated_replies = {} + + @util.log_errors + def dispatch_submission(self, raw_msg): + try: + idents, msg_list = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg_list, content=False, copy=False) + except Exception as e: + self.log.error( + f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True + ) + return + + # send to monitor + header = msg['header'] + metadata = msg['metadata'] + original_msg_id = header['msg_id'] + + targets = metadata.get('targets', []) + self.accumulated_replies[original_msg_id] = { f'{original_msg_id}_{target}': None for target in targets} + metadata['original_msg_id'] = original_msg_id + + for target in targets: + msg_and_target_id = f'{original_msg_id}_{target}' + self.all_ids.add(msg_and_target_id) + header['msg_id'] = msg_and_target_id + new_idents = [cast_bytes(target)] + idents + new_msg_list = self.session.serialize(msg, ident=new_idents) + new_msg_list.extend(msg['buffers']) + + self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) + # self.log.debug("Sending %r", new_msg_list) + self.engine_stream.send_multipart(new_msg_list, copy=False) + + + + @util.log_errors + def dispatch_result(self, raw_msg): + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + engine, client = idents[:2] + except Exception as e: + self.log.error( + f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True + ) + return + + metadata = msg['metadata'] + msg_id = msg['parent_header']['msg_id'] + + success = metadata['status'] == 'ok' + if success: + self.all_completed.add(msg_id) + else: + self.all_failed.add(msg_id) + + original_msg_id = metadata['original_msg_id'] + self.accumulated_replies[original_msg_id][msg_id] = raw_msg + raw_msg[:2] = [client, engine] + + if all(msg is not None for msg + in self.accumulated_replies[original_msg_id].values()): + + self.client_stream.send_multipart( + [ + msgpart for msg in + self.accumulated_replies[original_msg_id].values() + for msgpart in msg + ] + , copy=False) + self.all_done.add(original_msg_id) + + self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) \ No newline at end of file diff --git a/ipyparallel/controller/hub.py 
b/ipyparallel/controller/hub.py index bd718b54d..01e789524 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -152,9 +152,16 @@ def _task_default(self): broadcast_non_coalescing = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for BroadcastNonCoalescing queue""") + def _broadcast_non_coalescing_default(self): return tuple(util.select_random_ports(2)) + broadcast_coalescing = Tuple(Integer(), Integer(), config=True, + help="""Client/Engine Port pair for BroadcastCoalescing queue""") + + def _broadcast_coalescing_default(self): + return tuple(util.select_random_ports(2)) + control = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for Control queue""") @@ -280,7 +287,7 @@ def init_hub(self): scheme = TaskScheduler.scheme_name.default_value # build connection dicts - engine = self.engine_info = { # TODO: Add broadcast + engine = self.engine_info = { 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip), 'registration' : self.regport, 'control' : self.control[1], @@ -289,7 +296,8 @@ def init_hub(self): 'hb_pong' : self.hb[1], 'task' : self.task[1], 'iopub' : self.iopub[1], - 'broadcast_non_coalescing': self.broadcast_non_coalescing[1] + 'broadcast_non_coalescing': self.broadcast_non_coalescing[1], + 'broadcast_coalescing': self.broadcast_coalescing[1] } client = self.client_info = { @@ -301,7 +309,8 @@ def init_hub(self): 'task_scheme' : scheme, 'iopub' : self.iopub[0], 'notification' : self.notifier_port, - 'broadcast_non_coalescing': self.broadcast_non_coalescing[0] + 'broadcast_non_coalescing': self.broadcast_non_coalescing[0], + 'broadcast_coalescing': self.broadcast_coalescing[0] } self.log.debug("Hub engine addrs: %s", self.engine_info) @@ -726,7 +735,6 @@ def save_queue_result(self, idents, msg): #--------------------- Broadcast traffic ------------------------------ def save_broadcast_request(self, idents, msg): client_id = idents[0] - try: msg = self.session.deserialize(msg) except Exception as e: diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index 6201a2864..e728d99c7 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -232,7 +232,7 @@ def url(key): heart.start() # create Shell Connections (MUX, Task, etc.): - shell_addrs = url('mux'), url('task'), url('broadcast_non_coalescing') + shell_addrs = url('mux'), url('task'), url('broadcast_non_coalescing'), url('broadcast_coalescing') # Use only one shell stream for mux and tasks stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop) diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index 9e01dd365..def636bb4 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -52,9 +52,8 @@ def should_handle(self, stream, msg, idents): return False return True - def is_broadcast(self, parent): - return 'metadata' in parent and 'is_broadcast' in parent['metadata']\ - and parent['metadata']['is_broadcast'] + def extract_original_msg_id(self, parent): + return parent.get('metadata', {}).get('original_msg_id', '') def init_metadata(self, parent): """init metadata dict, for execute/apply_reply""" @@ -62,8 +61,9 @@ def init_metadata(self, parent): 'started': utcnow(), 'dependencies_met' : True, 'engine' : self.ident, - 'is_broadcast': self.is_broadcast(parent) - + 'is_broadcast_non_coalescing': parent.get('metadata', {}).get('is_broadcast_non_coalescing', False), + 'is_broadcast_coalescing': parent.get('metadata', {}).get('is_broadcast_coalescing', False), + 
'original_msg_id': self.extract_original_msg_id(parent) } def finish_metadata(self, parent, metadata, reply_content): diff --git a/ipyparallel/serialize/serialize.py b/ipyparallel/serialize/serialize.py index f39e92498..81850de28 100644 --- a/ipyparallel/serialize/serialize.py +++ b/ipyparallel/serialize/serialize.py @@ -125,7 +125,7 @@ def serialize_object(obj, buffer_threshold=MAX_BYTES, item_threshold=MAX_ITEMS): buffers.insert(0, pickle.dumps(cobj, PICKLE_PROTOCOL)) return buffers -def deserialize_object(buffers, g=None): +def deserialize_object(buffers, g=None, try_to_extract_all=False): """reconstruct an object serialized by serialize_object from data buffers. Parameters @@ -143,7 +143,15 @@ def deserialize_object(buffers, g=None): bufs = list(buffers) pobj = buffer_to_bytes_py2(bufs.pop(0)) canned = pickle.loads(pobj) - if istype(canned, sequence_types) and len(canned) < MAX_ITEMS: + if try_to_extract_all: + unpickled_buffers = [canned] + for buf in bufs: + try: + unpickled_buffers.append(pickle.loads(buffer_to_bytes_py2(buf))) + except Exception: + continue + return unpickled_buffers + elif istype(canned, sequence_types) and len(canned) < MAX_ITEMS: for c in canned: _restore_buffers(c, bufs) newobj = uncan_sequence(canned, g) From 18cd105e839ca76fae13ab4484cf391c9abf044e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 10 Mar 2020 11:51:50 +0100 Subject: [PATCH 14/34] spanning tree schedulers MVP --- ipyparallel/apps/ipcontrollerapp.py | 120 +++++++++++------- ipyparallel/client/client.py | 22 +++- ipyparallel/client/view.py | 40 ++++++ ipyparallel/controller/broadcast_scheduler.py | 28 ++-- .../controller/exponential_scheduler.py | 95 ++++++++++++++ ipyparallel/controller/hub.py | 12 +- ipyparallel/controller/scheduler.py | 34 ++++- ipyparallel/engine/engine.py | 9 +- ipyparallel/engine/kernel.py | 1 + 9 files changed, 289 insertions(+), 72 deletions(-) create mode 100644 ipyparallel/controller/exponential_scheduler.py diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index ecc1ae6db..60d459e50 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -39,6 +39,7 @@ from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing, \ BroadcastSchedulerCoalescing +from ipyparallel.controller.exponential_scheduler import ExponentialScheduler from ipyparallel.controller.heartmonitor import HeartMonitor from ipyparallel.controller.hub import HubFactory from ipyparallel.controller.scheduler import launch_scheduler @@ -342,24 +343,15 @@ def init_hub(self): self.config.Session.key = self.factory.session.key def launch_python_scheduler(self, scheduler_args, children): - kwargs = {} - kwargs.update(scheduler_args) - kwargs.update( - dict( - logname='scheduler', - loglevel=self.log_level, - log_url=self.log_url,config=dict(self.config), - ) - ) if 'Process' in self.mq_class: # run the Python scheduler in a Process - q = Process(target=launch_scheduler, kwargs=kwargs) + q = Process(target=launch_scheduler, kwargs=scheduler_args) q.daemon = True children.append(q) else: # single-threaded Controller - kwargs['in_thread'] = True - launch_scheduler(**kwargs) + scheduler_args['in_thread'] = True + launch_scheduler(**scheduler_args) def init_schedulers(self): children = self.children @@ -421,40 +413,42 @@ def init_schedulers(self): else: self.log.info("task::using Python %s Task scheduler"%scheme) - scheduler_args = dict( - scheduler_class=TaskScheduler, - 
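
The try_to_extract_all branch added to deserialize_object earlier in this patch matches the shape of a coalesced reply: a buffer list carrying one pickled result per engine, concatenated in arrival order. A self-contained sketch of that decode step, using plain pickle rather than ipyparallel's canning layer:

    import pickle

    # stand-in for the buffers of a coalesced apply_reply: one pickle per engine
    buffers = [pickle.dumps(f'result from engine {i}') for i in range(4)]

    def deserialize_all(bufs):
        """Unpickle every frame, skipping any frame that does not unpickle."""
        results = []
        for buf in bufs:
            try:
                results.append(pickle.loads(buf))
            except Exception:
                continue
        return results

    print(deserialize_all(buffers))
    # ['result from engine 0', ..., 'result from engine 3']
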
in_addr=f.client_url('task'), - out_addr=f.engine_url('task'), - mon_addr=monitor_url, - not_addr=disambiguate_url(f.client_url('notification')), - reg_addr=disambiguate_url(f.client_url('registration')), - identity=b'task', - ) - self.launch_python_scheduler(scheduler_args, children) - - scheduler_args = dict( - scheduler_class=BroadcastSchedulerNonCoalescing, - in_addr=f.client_url('broadcast_non_coalescing'), - out_addr=f.engine_url('broadcast_non_coalescing'), - mon_addr=monitor_url, - not_addr=disambiguate_url(f.client_url('notification')), - reg_addr=disambiguate_url(f.client_url('registration')), - identity=b'broadcast_non_coalescing', - ) - - self.launch_python_scheduler(scheduler_args, children) - - scheduler_args = dict( - scheduler_class=BroadcastSchedulerCoalescing, - in_addr=f.client_url('broadcast_coalescing'), - out_addr=f.engine_url('broadcast_coalescing'), - mon_addr=monitor_url, - not_addr=disambiguate_url(f.client_url('notification')), - reg_addr=disambiguate_url(f.client_url('registration')), - identity=b'broadcast_coalescing', - ) - - self.launch_python_scheduler(scheduler_args, children) + self.launch_python_scheduler(self.get_python_scheduler_args('task', f, TaskScheduler, monitor_url) + , children) + + self.launch_python_scheduler(self.get_python_scheduler_args( + 'broadcast_non_coalescing', f, BroadcastSchedulerNonCoalescing, monitor_url + ), children) + + self.launch_python_scheduler(self.get_python_scheduler_args( + 'broadcast_coalescing', f, BroadcastSchedulerCoalescing, monitor_url + ), children) + + sub_scheduler_ids = [bytes(f'sub_scheduler_{i}', 'utf8') for i in range(7)] + + self.launch_python_scheduler(self.get_python_scheduler_args( + 'sub_scheduler', f, ExponentialScheduler, monitor_url, + identity=sub_scheduler_ids[0], + is_root=True, + is_sub_scheduler=True, + connected_sub_schedulers=sub_scheduler_ids[1:3], + ), children) + + self.launch_python_scheduler(self.get_python_scheduler_args( + 'sub_scheduler', f, ExponentialScheduler, monitor_url, + identity=sub_scheduler_ids[1], + is_leaf=True, + is_sub_scheduler=True, + connect=True + ), children) + + self.launch_python_scheduler(self.get_python_scheduler_args( + 'sub_scheduler', f, ExponentialScheduler, monitor_url, + identity=sub_scheduler_ids[2], + is_leaf=True, + is_sub_scheduler=True, + connect=True + ), children) # set unlimited HWM for all relay devices if hasattr(zmq, 'SNDHWM'): @@ -543,6 +537,40 @@ def start(self): finally: self.cleanup_connection_files() + def get_python_scheduler_args( + self, + scheduler_name, + factory, + scheduler_class, + monitor_url, + identity=None, + is_leaf=False, + is_root=False, + connected_sub_schedulers=None, + is_sub_scheduler=False, + connect=False, + ): + return { + 'scheduler_class': scheduler_class, + 'in_addr': factory.client_url(scheduler_name), + 'out_addr': factory.engine_url(scheduler_name), + 'mon_addr': monitor_url, + 'not_addr': disambiguate_url(factory.client_url('notification')), + 'reg_addr': disambiguate_url(factory.client_url('registration')), + 'identity': identity if identity else bytes(scheduler_name, 'utf8'), + 'is_leaf': is_leaf, + 'is_root': is_root, + 'connected_sub_schedulers': connected_sub_schedulers + if connected_sub_schedulers + else [], + 'is_sub_scheduler': is_sub_scheduler, + 'logname': 'scheduler', + 'loglevel': self.log_level, + 'log_url': self.log_url, + 'config': dict(self.config), + 'connect': connect + } + def launch_new_instance(*args, **kwargs): """Create and run the IPython controller""" diff --git 
a/ipyparallel/client/client.py b/ipyparallel/client/client.py index ed3aef016..0615f7ef5 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -51,7 +51,8 @@ from ..util import ioloop from .asyncresult import AsyncResult, AsyncHubResult from .futures import MessageFuture, multi_future -from .view import DirectView, LoadBalancedView, BroadcastViewNonCoalescing, BroadcastViewCoalescing +from .view import DirectView, LoadBalancedView, BroadcastViewNonCoalescing, \ + BroadcastViewCoalescing, ExponentialView import jupyter_client.session jupyter_client.session.extract_dates = lambda obj: obj # -------------------------------------------------------------------------- @@ -361,6 +362,8 @@ def _profile_default(self): _task_socket=Instance('zmq.Socket', allow_none=True) _broadcast_non_coalescing_socket=Instance('zmq.Socket', allow_none=True) _broadcast_coalescing_socket=Instance('zmq.Socket', allow_none=True) + _sub_scheduler_socket = Instance('zmq.Socket', allow_none=True) + _task_scheme=Unicode() _closed = False @@ -451,7 +454,9 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No 'notification', 'registration', 'broadcast_non_coalescing', - 'broadcast_coalescing'): + 'broadcast_coalescing', + 'sub_scheduler', + ): cfg[key] = cfg['interface'] + ':%i' % cfg[key] url = cfg['registration'] @@ -668,6 +673,9 @@ def connect_socket(s, url): connect_socket(self._broadcast_coalescing_socket, cfg['broadcast_coalescing']) + self._sub_scheduler_socket = self._context.socket(zmq.DEALER) + connect_socket(self._sub_scheduler_socket, cfg['sub_scheduler']) + self._notification_socket = self._context.socket(zmq.SUB) self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'') connect_socket(self._notification_socket, cfg['notification']) @@ -917,10 +925,16 @@ def _setup_streams(self): self._broadcast_non_coalescing_stream = ZMQStream( self._broadcast_non_coalescing_socket, self._io_loop) self._broadcast_non_coalescing_stream.on_recv(self._dispatch_reply, copy=False) + self._broadcast_coalescing_stream = ZMQStream( self._broadcast_coalescing_socket, self._io_loop) self._broadcast_coalescing_stream.on_recv(self._dispatch_reply, copy=False) + self._sub_scheduler_stream = ZMQStream( + self._sub_scheduler_socket, self._io_loop) + + self._sub_scheduler_stream.on_recv(self._dispatch_reply, copy=False) + def _start_io_thread(self): """Start IOLoop in a background thread.""" evt = Event() @@ -1640,6 +1654,10 @@ def broadcast_view(self, targets='all', is_coalescing=False, **kwargs): socket=self._broadcast_non_coalescing_stream, targets=targets, **kwargs) + def exponential_view(self, targets='all', **kwargs): + return ExponentialView( + client=self, socket=self._sub_scheduler_stream, targets=targets, **kwargs + ) #-------------------------------------------------------------------------- # Query methods #-------------------------------------------------------------------------- diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 0d35f54db..df1bc4c97 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -933,6 +933,46 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, return ar +class ExponentialView(DirectView): + def __init__(self, client=None, socket=None, targets=None): + super().__init__(client=client, socket=socket, targets=targets) + + @sync_results + @save_ids + def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, + targets=None): + args = [] if args is None else args + 
kwargs = {} if kwargs is None else kwargs + block = self.block if block is None else block + track = self.track if track is None else track + targets = self.targets if targets is None else targets + + idents, _targets = self.client._build_targets(targets) + + pf = PrePickled(f) + pargs = [PrePickled(arg) for arg in args] + pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} + + s_idents = [ident.decode("utf8") for ident in idents] + + metadata = dict(targets=s_idents) + + message_future = self.client.send_apply_request( + self._socket, pf, pargs, pkwargs, + track=track, metadata=metadata) + + self.client.outstanding.add(message_future.msg_id) + + ar = AsyncResult(self.client, message_future, fname=getname(f), + targets=_targets, + owner=True) + if block: + try: + return ar.get() + except KeyboardInterrupt: + pass + return ar + class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 92e711b76..26f9c3ffd 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -11,7 +11,6 @@ def __init__(self, *args, **kwargs): @util.log_errors def dispatch_submission(self, raw_msg): - try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) @@ -21,8 +20,6 @@ def dispatch_submission(self, raw_msg): ) return - # send to monitor - header = msg['header'] metadata = msg['metadata'] original_msg_id = header['msg_id'] @@ -39,7 +36,6 @@ def dispatch_submission(self, raw_msg): new_msg_list.extend(msg['buffers']) self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) - self.log.debug("Sending %r", new_msg_list) self.engine_stream.send_multipart(new_msg_list, copy=False) @util.log_errors @@ -89,13 +85,14 @@ def dispatch_submission(self, raw_msg): ) return - # send to monitor header = msg['header'] metadata = msg['metadata'] original_msg_id = header['msg_id'] targets = metadata.get('targets', []) - self.accumulated_replies[original_msg_id] = { f'{original_msg_id}_{target}': None for target in targets} + self.accumulated_replies[original_msg_id] = { + f'{original_msg_id}_{target}': None for target in targets + } metadata['original_msg_id'] = original_msg_id for target in targets: @@ -110,8 +107,6 @@ def dispatch_submission(self, raw_msg): # self.log.debug("Sending %r", new_msg_list) self.engine_stream.send_multipart(new_msg_list, copy=False) - - @util.log_errors def dispatch_result(self, raw_msg): try: @@ -137,16 +132,19 @@ def dispatch_result(self, raw_msg): self.accumulated_replies[original_msg_id][msg_id] = raw_msg raw_msg[:2] = [client, engine] - if all(msg is not None for msg - in self.accumulated_replies[original_msg_id].values()): + if all( + msg is not None + for msg in self.accumulated_replies[original_msg_id].values() + ): self.client_stream.send_multipart( [ - msgpart for msg in - self.accumulated_replies[original_msg_id].values() + msgpart + for msg in self.accumulated_replies[original_msg_id].values() for msgpart in msg - ] - , copy=False) + ], + copy=False, + ) self.all_done.add(original_msg_id) - self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) \ No newline at end of file + self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) diff --git a/ipyparallel/controller/exponential_scheduler.py b/ipyparallel/controller/exponential_scheduler.py new file mode 100644 
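
The scheduler module created below fans a submission out by giving each child a contiguous slice of the target list, MESSAGES_PER_SCHEDULER engines at a time; a child whose slice would be empty is dropped from the bookkeeping up front. A standalone sketch of that partitioning rule (only the constant's value is taken from the module below):

    MESSAGES_PER_SCHEDULER = 7

    def partition_targets(targets, n_children, per_child=MESSAGES_PER_SCHEDULER):
        """Assign each child scheduler a contiguous slice of the targets."""
        slices = {}
        for i in range(n_children):
            chunk = targets[i * per_child:(i + 1) * per_child]
            if chunk:  # a child with no targets is skipped entirely
                slices[i] = chunk
        return slices

    print(partition_targets(list(range(10)), n_children=2))
    # {0: [0, 1, 2, 3, 4, 5, 6], 1: [7, 8, 9]}
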
index 000000000..529516d57 --- /dev/null +++ b/ipyparallel/controller/exponential_scheduler.py @@ -0,0 +1,95 @@ +from ipython_genutils.py3compat import cast_bytes + +from ipyparallel import util +from ipyparallel.controller.scheduler import Scheduler + +MESSAGES_PER_SCHEDULER = 7 + + +class ExponentialScheduler(Scheduler): + accumulated_replies = {} + + def __init__( + self, + *args, + is_root=False, + is_leaf=False, + identity=None, + connected_sub_schedulers=None, + **kwargs, + ): + super().__init__(**kwargs) + self.is_root = is_root + self.is_leaf = is_leaf + self.identity = identity + self.connected_sub_schedulers = connected_sub_schedulers + + def add_connected_sub_scheduler(self, sub_scheduler_id): + self.connected_sub_schedulers.append(sub_scheduler_id) + + def start(self): + self.client_stream.on_recv(self.dispatch_submission, copy=False) + + def resume_receiving(self): + self.client_stream.on_recv(self.dispatch_submission) + + def stop_receiving(self): + self.client_stream.on_recv(None) + + def send_to_sub_schedulers(self, msg, targets, idents): + original_msg_id = msg['header']['msg_id'] + self.accumulated_replies[original_msg_id] = { + f'{original_msg_id}_{scheduler_id.decode("utf8")}': None + for scheduler_id in self.connected_sub_schedulers + } + + for i, scheduler_id in enumerate(self.connected_sub_schedulers): + msg_and_scheduler_id = f'{original_msg_id}_{scheduler_id.decode("utf8")}' + + targets_for_scheduler = targets[ + i * MESSAGES_PER_SCHEDULER : (i + 1) * MESSAGES_PER_SCHEDULER + ] + if not targets_for_scheduler: + del self.accumulated_replies[original_msg_id][msg_and_scheduler_id] + continue + msg['header']['msg_id'] = msg_and_scheduler_id + msg['metadata']['targets'] = targets_for_scheduler + self.all_ids.add(msg_and_scheduler_id) + new_idents = [cast_bytes(scheduler_id + b'_in')] + idents + new_msg_list = self.session.serialize(msg, ident=new_idents) + new_msg_list.extend(msg['buffers']) + self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) + self.client_stream.send_multipart(new_msg_list, copy=False) + + @util.log_errors + def dispatch_submission(self, raw_msg): + self.log.info(f'Exponential msg received ') + try: + idents, msg_list = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg_list, content=False, copy=False) + except Exception as e: + self.log.error(f'exponential scheduler:: Invalid msg: {raw_msg}') + return + header = msg['header'] + metadata = msg['metadata'] + original_msg_id = header['msg_id'] + targets = metadata.get('targets', []) + if not self.is_leaf: + self.send_to_sub_schedulers(msg, targets, idents) + else: + self.accumulated_replies[original_msg_id] = { + f'{original_msg_id}_{target}': None for target in targets + } + for target in targets: + msg_and_target_id = f'{original_msg_id}_{target}' + self.all_ids.add(msg_and_target_id) + header['msg_id'] = msg_and_target_id + new_idents = [cast_bytes(target)] + idents + new_msg_list = self.session.serialize(msg, ident=new_idents) + new_msg_list.extend(msg['buffers']) + + self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) + self.engine_stream.send_multipart(new_msg_list, copy=False) + + def dispatch_result(self, raw_msg): + self.log.info(f'expo: {self.id} received {raw_msg}') diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 01e789524..c148d8d41 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -162,6 +162,12 @@ def _broadcast_non_coalescing_default(self): def 
_broadcast_coalescing_default(self): return tuple(util.select_random_ports(2)) + sub_scheduler = Tuple(Integer(), Integer(), config=True, + help="Port pair for queue from client through sub schedulers to engines") + + def _sub_scheduler_default(self): + return tuple(util.select_random_ports(2)) + control = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for Control queue""") @@ -297,7 +303,8 @@ def init_hub(self): 'task' : self.task[1], 'iopub' : self.iopub[1], 'broadcast_non_coalescing': self.broadcast_non_coalescing[1], - 'broadcast_coalescing': self.broadcast_coalescing[1] + 'broadcast_coalescing': self.broadcast_coalescing[1], + 'sub_scheduler': self.sub_scheduler[1], } client = self.client_info = { @@ -310,7 +317,8 @@ def init_hub(self): 'iopub' : self.iopub[0], 'notification' : self.notifier_port, 'broadcast_non_coalescing': self.broadcast_non_coalescing[0], - 'broadcast_coalescing': self.broadcast_coalescing[0] + 'broadcast_coalescing': self.broadcast_coalescing[0], + 'sub_scheduler': self.sub_scheduler[0], } self.log.debug("Hub engine addrs: %s", self.engine_info) diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 2c8af0b5d..40a40c3a1 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -24,6 +24,7 @@ from ipyparallel.util import connect_logger, local_logger, ioloop import jupyter_client.session + jupyter_client.session.extract_dates = lambda obj: obj from jupyter_client.session import SessionFactory @@ -82,7 +83,6 @@ def dispatch_submission(self, raw_msg): raise NotImplementedError("Implement in subclasses") - def launch_scheduler( scheduler_class, in_addr, @@ -96,6 +96,11 @@ def launch_scheduler( loglevel=logging.DEBUG, identity=None, in_thread=False, + is_leaf=False, + is_root=False, + connected_sub_schedulers=None, + is_sub_scheduler=False, + connect=False ): ZMQStream = zmqstream.ZMQStream @@ -114,18 +119,25 @@ def launch_scheduler( ctx = zmq.Context() loop = ioloop.IOLoop() - ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(ins, 0) if identity: ins.setsockopt(zmq.IDENTITY, identity + b'_in') - ins.bind(in_addr) + if connect: + ins.connect(in_addr) + else: + ins.bind(in_addr) outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(outs, 0) + if identity: outs.setsockopt(zmq.IDENTITY, identity + b'_out') - outs.bind(out_addr) + if connect: + outs.connect(out_addr) + else: + outs.bind(out_addr) + mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop) util.set_hwm(mons, 0) mons.connect(mon_addr) @@ -147,7 +159,7 @@ def launch_scheduler( else: log = local_logger(logname, loglevel) - scheduler = scheduler_class( + scheduler_args = dict( client_stream=ins, engine_stream=outs, mon_stream=mons, @@ -157,6 +169,18 @@ def launch_scheduler( log=log, config=config, ) + if is_sub_scheduler: + scheduler_args.update( + dict( + is_leaf=is_leaf, + is_root=is_root, + connected_sub_schedulers=connected_sub_schedulers, + identity=identity + ) + ) + + scheduler = scheduler_class(**scheduler_args) + scheduler.start() if not in_thread: try: diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index e728d99c7..6dba2a0e0 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -232,7 +232,12 @@ def url(key): heart.start() # create Shell Connections (MUX, Task, etc.): - shell_addrs = url('mux'), url('task'), url('broadcast_non_coalescing'), url('broadcast_coalescing') + shell_addrs = \ + url('mux'),\ + url('task'),\ + 
url('broadcast_non_coalescing'),\ + url('broadcast_coalescing'),\ + url('sub_scheduler') # Use only one shell stream for mux and tasks stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop) @@ -240,7 +245,7 @@ def url(key): self.log.debug("Setting shell identity %r", identity) shell_streams = [stream] - for addr in shell_addrs: # TODO: Possibly problematic + for addr in shell_addrs: self.log.info("Connecting shell to %s", addr) connect(stream, addr) diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index def636bb4..7c7e501d5 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -98,6 +98,7 @@ def apply_request(self, stream, ident, parent): md = self.finish_metadata(parent, md, reply_content) # flush i/o + self.log.info(f'ENGINE apply request, ident: {ident}') sys.stdout.flush() sys.stderr.flush() self.log.debug('engine: sending apply_reply') From 3cfc03209da6675eceda7c430970356df9a880cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 17 Mar 2020 10:57:45 +0100 Subject: [PATCH 15/34] start of work for scheduler-scheduler streams --- ipyparallel/controller/scheduler.py | 140 +++++++++++++++++----------- 1 file changed, 84 insertions(+), 56 deletions(-) diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 40a40c3a1..750b62b32 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -82,29 +82,18 @@ def dispatch_result(self, raw_msg): def dispatch_submission(self, raw_msg): raise NotImplementedError("Implement in subclasses") - -def launch_scheduler( - scheduler_class, - in_addr, - out_addr, - mon_addr, - not_addr, - reg_addr, - config=None, - logname='root', - log_url=None, - loglevel=logging.DEBUG, - identity=None, - in_thread=False, - is_leaf=False, - is_root=False, - connected_sub_schedulers=None, - is_sub_scheduler=False, - connect=False +ZMQStream = zmqstream.ZMQStream + +def get_common_scheduler_streams( + mon_addr, + not_addr, + reg_addr, + config, + logname, + log_url, + loglevel, + in_thread, ): - - ZMQStream = zmqstream.ZMQStream - if config: # unwrap dict back into Config config = Config(config) @@ -118,28 +107,7 @@ def launch_scheduler( # for safety with multiprocessing ctx = zmq.Context() loop = ioloop.IOLoop() - - ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) - util.set_hwm(ins, 0) - if identity: - ins.setsockopt(zmq.IDENTITY, identity + b'_in') - if connect: - ins.connect(in_addr) - else: - ins.bind(in_addr) - - outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) - util.set_hwm(outs, 0) - - if identity: - outs.setsockopt(zmq.IDENTITY, identity + b'_out') - if connect: - outs.connect(out_addr) - else: - outs.bind(out_addr) - mons = zmqstream.ZMQStream(ctx.socket(zmq.PUB), loop) - util.set_hwm(mons, 0) mons.connect(mon_addr) nots = zmqstream.ZMQStream(ctx.socket(zmq.SUB), loop) nots.setsockopt(zmq.SUBSCRIBE, b'') @@ -158,8 +126,51 @@ def launch_scheduler( ) else: log = local_logger(logname, loglevel) + return config, ctx, loop, mons, nots, querys, log + - scheduler_args = dict( +def launch_scheduler( + scheduler_class, + in_addr, + out_addr, + mon_addr, + not_addr, + reg_addr, + config=None, + logname='root', + log_url=None, + loglevel=logging.DEBUG, + identity=None, + in_thread=False, +): + config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( + mon_addr, + not_addr, + reg_addr, + config, + logname, + log_url, + loglevel, + in_thread + ) + + util.set_hwm(mons, 0) + ins = ZMQStream(ctx.socket(zmq.ROUTER), 
loop) + util.set_hwm(ins, 0) + if identity: + ins.setsockopt(zmq.IDENTITY, identity + b'_in') + else: + ins.bind(in_addr) + + outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(outs, 0) + + if identity: + outs.setsockopt(zmq.IDENTITY, identity + b'_out') + else: + outs.bind(out_addr) + + scheduler = scheduler_class( client_stream=ins, engine_stream=outs, mon_stream=mons, @@ -167,19 +178,8 @@ def launch_scheduler( query_stream=querys, loop=loop, log=log, - config=config, + config=config ) - if is_sub_scheduler: - scheduler_args.update( - dict( - is_leaf=is_leaf, - is_root=is_root, - connected_sub_schedulers=connected_sub_schedulers, - identity=identity - ) - ) - - scheduler = scheduler_class(**scheduler_args) scheduler.start() if not in_thread: @@ -187,3 +187,31 @@ def launch_scheduler( loop.start() except KeyboardInterrupt: scheduler.log.critical("Interrupted, exiting...") + +def launch_tree_spanning_scheduler( + in_addr, + out_addrs, + mon_addr, + not_addr, + reg_addr, + config=None, + loglevel=logging.DEBUG, + log_url=None, + is_leaf=False, + is_root=False, + in_thread=False, +): + config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( + mon_addr, + not_addr, + reg_addr, + config, + 'scheduler', + log_url, + loglevel, + in_thread + ) + ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(ins, 0) + + From 0427c1003a93ea1b05c6346237b5bf3d222cdf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 17 Mar 2020 10:58:19 +0100 Subject: [PATCH 16/34] start of work for scheduler-scheduler streams --- ipyparallel/apps/ipcontrollerapp.py | 50 +++++-------------- .../controller/exponential_scheduler.py | 3 +- ipyparallel/controller/hub.py | 25 +++++++--- ipyparallel/engine/engine.py | 15 +++--- 4 files changed, 40 insertions(+), 53 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 60d459e50..830faa9ab 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -137,6 +137,8 @@ aliases.update(base_aliases) aliases.update(session_aliases) +SPANNING_TREE_SCHEDULER_DEPTH = 2 + class IPControllerApp(BaseParallelApplication): name = u'ipcontroller' @@ -424,31 +426,9 @@ def init_schedulers(self): 'broadcast_coalescing', f, BroadcastSchedulerCoalescing, monitor_url ), children) - sub_scheduler_ids = [bytes(f'sub_scheduler_{i}', 'utf8') for i in range(7)] - - self.launch_python_scheduler(self.get_python_scheduler_args( - 'sub_scheduler', f, ExponentialScheduler, monitor_url, - identity=sub_scheduler_ids[0], - is_root=True, - is_sub_scheduler=True, - connected_sub_schedulers=sub_scheduler_ids[1:3], - ), children) - - self.launch_python_scheduler(self.get_python_scheduler_args( - 'sub_scheduler', f, ExponentialScheduler, monitor_url, - identity=sub_scheduler_ids[1], - is_leaf=True, - is_sub_scheduler=True, - connect=True - ), children) - - self.launch_python_scheduler(self.get_python_scheduler_args( - 'sub_scheduler', f, ExponentialScheduler, monitor_url, - identity=sub_scheduler_ids[2], - is_leaf=True, - is_sub_scheduler=True, - connect=True - ), children) + self.launch_spanning_tree_schedulers( + f, monitor_url, children + ) # set unlimited HWM for all relay devices if hasattr(zmq, 'SNDHWM'): @@ -544,11 +524,6 @@ def get_python_scheduler_args( scheduler_class, monitor_url, identity=None, - is_leaf=False, - is_root=False, - connected_sub_schedulers=None, - is_sub_scheduler=False, - connect=False, ): return { 'scheduler_class': scheduler_class, @@ -558,19 
+533,20 @@ def get_python_scheduler_args( 'not_addr': disambiguate_url(factory.client_url('notification')), 'reg_addr': disambiguate_url(factory.client_url('registration')), 'identity': identity if identity else bytes(scheduler_name, 'utf8'), - 'is_leaf': is_leaf, - 'is_root': is_root, - 'connected_sub_schedulers': connected_sub_schedulers - if connected_sub_schedulers - else [], - 'is_sub_scheduler': is_sub_scheduler, 'logname': 'scheduler', 'loglevel': self.log_level, 'log_url': self.log_url, 'config': dict(self.config), - 'connect': connect } + def launch_spanning_tree_schedulers(self, factory, monitor_url, children): + + + + + + + def launch_new_instance(*args, **kwargs): """Create and run the IPython controller""" diff --git a/ipyparallel/controller/exponential_scheduler.py b/ipyparallel/controller/exponential_scheduler.py index 529516d57..1eedf392e 100644 --- a/ipyparallel/controller/exponential_scheduler.py +++ b/ipyparallel/controller/exponential_scheduler.py @@ -91,5 +91,6 @@ def dispatch_submission(self, raw_msg): self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) self.engine_stream.send_multipart(new_msg_list, copy=False) + @util.log_errors def dispatch_result(self, raw_msg): - self.log.info(f'expo: {self.id} received {raw_msg}') + self.log.info(f'EXPO: received {raw_msg}') diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index c148d8d41..ccee789eb 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -21,13 +21,15 @@ # internal: from ipython_genutils.importstring import import_item + +from ..apps.ipcontrollerapp import SPANNING_TREE_SCHEDULER_DEPTH from ..util import extract_dates from jupyter_client.localinterfaces import localhost from ipython_genutils.py3compat import cast_bytes, unicode_type, iteritems, buffer_to_bytes_py2 from traitlets import ( HasTraits, Any, Instance, Integer, Unicode, Dict, Set, Tuple, - DottedObjectName, observe -) + DottedObjectName, observe, + List) from datetime import datetime from ipyparallel import error, util @@ -129,6 +131,9 @@ class EngineConnector(HasTraits): 'nodb' : 'ipyparallel.controller.dictdb.NoDB', } +def number_of_port_pairs_for_sub_schedulers(depth): + return 2**(depth-1)+2**depth - 1 + class HubFactory(RegistrationFactory): """The Configurable for setting up a Hub.""" @@ -162,11 +167,15 @@ def _broadcast_non_coalescing_default(self): def _broadcast_coalescing_default(self): return tuple(util.select_random_ports(2)) - sub_scheduler = Tuple(Integer(), Integer(), config=True, - help="Port pair for queue from client through sub schedulers to engines") + sub_schedulers = List(Trait=Tuple(Integer(), Integer(), config=True, + help="Port pair for queue from client through sub schedulers to engines"), + help="List of available ports for spanning tree schedulers") - def _sub_scheduler_default(self): - return tuple(util.select_random_ports(2)) + def _sub_schedulers_default(self): + return [util.select_random_ports(2) for i in range( + number_of_port_pairs_for_sub_schedulers( + SPANNING_TREE_SCHEDULER_DEPTH + ))] control = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for Control queue""") @@ -304,7 +313,7 @@ def init_hub(self): 'iopub' : self.iopub[1], 'broadcast_non_coalescing': self.broadcast_non_coalescing[1], 'broadcast_coalescing': self.broadcast_coalescing[1], - 'sub_scheduler': self.sub_scheduler[1], + 'sub_scheduler': (outgoing for incoming, outgoing in self.sub_schedulers), } client = self.client_info = { @@ -318,7 +327,7 @@ def 
init_hub(self): 'notification' : self.notifier_port, 'broadcast_non_coalescing': self.broadcast_non_coalescing[0], 'broadcast_coalescing': self.broadcast_coalescing[0], - 'sub_scheduler': self.sub_scheduler[0], + 'sub_scheduler': (incoming for incoming, outgoing in self.sub_schedulers), } self.log.debug("Hub engine addrs: %s", self.engine_info) diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index 6dba2a0e0..dd7d272f9 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -205,7 +205,10 @@ def complete_registration(self, msg, connect, maybe_tunnel): def url(key): """get zmq url for given channel""" return str(info["interface"] + ":%i" % info[key]) - + + def urls(keys): + return (url(key) for key in keys) + if content['status'] == 'ok': if self.id is not None and content['id'] != self.id: self.log.warning("Did not get the requested id: %i != %i", content['id'], self.id) @@ -232,12 +235,10 @@ def url(key): heart.start() # create Shell Connections (MUX, Task, etc.): - shell_addrs = \ - url('mux'),\ - url('task'),\ - url('broadcast_non_coalescing'),\ - url('broadcast_coalescing'),\ - url('sub_scheduler') + shell_addrs = (url('mux'), url('task'), url('broadcast_non_coalescing'), + url('broadcast_coalescing')) + urls('sub_schedulers') + + self.log.info(f'ENGINE: shell_addrs: {shell_addrs}') # Use only one shell stream for mux and tasks stream = zmqstream.ZMQStream(ctx.socket(zmq.ROUTER), loop) From 681904b6e5181cd0cae6c0da986c5abb49908964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 31 Mar 2020 12:58:37 +0200 Subject: [PATCH 17/34] spanning tree scheduler mvp --- ipyparallel/apps/ipcontrollerapp.py | 59 ++++- ipyparallel/client/client.py | 4 +- .../controller/exponential_scheduler.py | 243 +++++++++++++++--- ipyparallel/controller/hub.py | 48 ++-- ipyparallel/controller/scheduler.py | 35 +-- ipyparallel/engine/engine.py | 8 +- 6 files changed, 294 insertions(+), 103 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 830faa9ab..a7edce656 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -39,9 +39,10 @@ from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing, \ BroadcastSchedulerCoalescing -from ipyparallel.controller.exponential_scheduler import ExponentialScheduler +from ipyparallel.controller.exponential_scheduler import SpanningTreeScheduler, \ + SPANNING_TREE_SCHEDULER_DEPTH, launch_spanning_tree_scheduler from ipyparallel.controller.heartmonitor import HeartMonitor -from ipyparallel.controller.hub import HubFactory +from ipyparallel.controller.hub import HubFactory, get_number_of_non_leaf_schedulers from ipyparallel.controller.scheduler import launch_scheduler from ipyparallel.controller.task_scheduler import TaskScheduler from ipyparallel.controller.dictdb import DictDB @@ -137,7 +138,7 @@ aliases.update(base_aliases) aliases.update(session_aliases) -SPANNING_TREE_SCHEDULER_DEPTH = 2 + class IPControllerApp(BaseParallelApplication): @@ -541,12 +542,54 @@ def get_python_scheduler_args( def launch_spanning_tree_schedulers(self, factory, monitor_url, children): + def launch_in_thread_or_process(scheduler_args): - - - - - + if 'Process' in self.mq_class: + # run the Python scheduler in a Process + q = Process(target=launch_spanning_tree_scheduler, kwargs=scheduler_args) + q.daemon = True + children.append(q) + else: + # single-threaded Controller + scheduler_args['in_thread'] = True + 
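
launch_in_thread_or_process above repeats the launch pattern the other Python schedulers use: a daemonized Process when the controller runs with multiprocess queues, otherwise a direct call with in_thread=True. Reduced to its essentials (the launch function here is a stand-in for launch_spanning_tree_scheduler):

    from multiprocessing import Process

    def launch(in_thread=False, **kwargs):
        # stand-in for launch_spanning_tree_scheduler
        print('scheduler started', kwargs, 'in thread' if in_thread else 'in child process')

    def launch_in_thread_or_process(scheduler_args, children, multiprocess=True):
        if multiprocess:
            q = Process(target=launch, kwargs=scheduler_args)
            q.daemon = True     # dies with the controller
            children.append(q)  # started later with the controller's other children
        else:
            scheduler_args['in_thread'] = True
            launch(**scheduler_args)

    if __name__ == '__main__':
        children = []
        launch_in_thread_or_process({'identity': 0}, children, multiprocess=False)
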
launch_spanning_tree_scheduler(**scheduler_args) + + def recursively_start_schedulers(identity, depth): + outgoing_id1 = identity * 2 + 1 + outgoing_id2 = outgoing_id1 + 1 + is_leaf = depth == SPANNING_TREE_SCHEDULER_DEPTH + + scheduler_args = dict( + in_addr=factory.client_url('sub_schedulers', identity), + mon_addr=monitor_url, + not_addr=disambiguate_url(factory.client_url('notification')), + reg_addr=disambiguate_url(factory.client_url('registration')), + identity=identity, + config=dict(self.config), + loglevel=self.log_level, + log_url=self.log_url, + outgoing_ids=[outgoing_id1, outgoing_id2] + ) + if is_leaf: + scheduler_args.update( + out_addrs=[ + factory.engine_url('sub_schedulers', identity - get_number_of_non_leaf_schedulers()) + ], + is_leaf=is_leaf + ) + else: + scheduler_args.update( + out_addrs=[ + factory.client_url('sub_schedulers', outgoing_id1), + factory.client_url('sub_schedulers', outgoing_id2) + ], + ) + launch_in_thread_or_process(scheduler_args) + if not is_leaf: + recursively_start_schedulers(outgoing_id1, depth + 1) + recursively_start_schedulers(outgoing_id2, depth + 1) + + recursively_start_schedulers(0, 0) def launch_new_instance(*args, **kwargs): """Create and run the IPython controller""" diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 0615f7ef5..c1d34d4c0 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -455,10 +455,10 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No 'registration', 'broadcast_non_coalescing', 'broadcast_coalescing', - 'sub_scheduler', ): cfg[key] = cfg['interface'] + ':%i' % cfg[key] + cfg['sub_schedulers'] = cfg['interface'] + ':%i' % cfg['sub_schedulers'][0] url = cfg['registration'] if location is not None and addr == localhost(): @@ -674,7 +674,7 @@ def connect_socket(s, url): cfg['broadcast_coalescing']) self._sub_scheduler_socket = self._context.socket(zmq.DEALER) - connect_socket(self._sub_scheduler_socket, cfg['sub_scheduler']) + connect_socket(self._sub_scheduler_socket, cfg['sub_schedulers']) self._notification_socket = self._context.socket(zmq.SUB) self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'') diff --git a/ipyparallel/controller/exponential_scheduler.py b/ipyparallel/controller/exponential_scheduler.py index 1eedf392e..60c4352ec 100644 --- a/ipyparallel/controller/exponential_scheduler.py +++ b/ipyparallel/controller/exponential_scheduler.py @@ -1,34 +1,32 @@ +import logging + +import zmq from ipython_genutils.py3compat import cast_bytes from ipyparallel import util -from ipyparallel.controller.scheduler import Scheduler - -MESSAGES_PER_SCHEDULER = 7 +from ipyparallel.controller.scheduler import ( + Scheduler, + get_common_scheduler_streams, + ZMQStream, +) +SPANNING_TREE_SCHEDULER_DEPTH = 2 -class ExponentialScheduler(Scheduler): +class SpanningTreeScheduler(Scheduler): accumulated_replies = {} def __init__( - self, - *args, - is_root=False, - is_leaf=False, - identity=None, - connected_sub_schedulers=None, - **kwargs, + self, *args, connected_sub_schedulers=None, outgoing_streams=None, **kwargs ): super().__init__(**kwargs) - self.is_root = is_root - self.is_leaf = is_leaf - self.identity = identity self.connected_sub_schedulers = connected_sub_schedulers - - def add_connected_sub_scheduler(self, sub_scheduler_id): - self.connected_sub_schedulers.append(sub_scheduler_id) + self.outgoing_streams = outgoing_streams + self.log.info('Spanning tree scheduler started') def start(self): 
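        # The tree wiring, as set up just below: requests come in on
        # client_stream (bound at the root, connected upward from interior
        # nodes), and each child sub-scheduler answers on its own outgoing
        # stream, where dispatch_result accumulates the partial replies
        # before forwarding a combined answer up toward the client.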
self.client_stream.on_recv(self.dispatch_submission, copy=False)
+        for outgoing_stream in self.outgoing_streams:
+            outgoing_stream.on_recv(self.dispatch_result, copy=False)

     def resume_receiving(self):
         self.client_stream.on_recv(self.dispatch_submission)
@@ -36,8 +34,19 @@ def resume_receiving(self):
     def stop_receiving(self):
         self.client_stream.on_recv(None)

-    def send_to_sub_schedulers(self, msg, targets, idents):
-        original_msg_id = msg['header']['msg_id']
+    @util.log_errors
+    def dispatch_submission(self, raw_msg):
+        self.log.info(f'Spanning tree msg received ')
+        try:
+            idents, msg_list = self.session.feed_identities(raw_msg, copy=False)
+            msg = self.session.deserialize(msg_list, content=False, copy=False)
+        except Exception as e:
+            self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}')
+            return
+        header = msg['header']
+        metadata = msg['metadata']
+        original_msg_id = header['msg_id']
+        targets = metadata.get('targets', [])
         self.accumulated_replies[original_msg_id] = {
             f'{original_msg_id}_{scheduler_id.decode("utf8")}': None
             for scheduler_id in self.connected_sub_schedulers
@@ -47,11 +56,11 @@
             msg_and_scheduler_id = f'{original_msg_id}_{scheduler_id.decode("utf8")}'
-            targets_for_scheduler = targets[
-                i * MESSAGES_PER_SCHEDULER : (i + 1) * MESSAGES_PER_SCHEDULER
-            ]
+            # split targets between the two children; the second child takes
+            # the remainder so an odd-length target list is not truncated
+            half = len(targets) // 2
+            targets_for_scheduler = targets[:half] if i == 0 else targets[half:]
             if not targets_for_scheduler:
                 del self.accumulated_replies[original_msg_id][msg_and_scheduler_id]
-                continue
+                # a placeholder reply may still be needed here so the parent
+                # scheduler does not wait forever on this empty branch
+                continue
             msg['header']['msg_id'] = msg_and_scheduler_id
             msg['metadata']['targets'] = targets_for_scheduler
             self.all_ids.add(msg_and_scheduler_id)
@@ -59,38 +68,190 @@
             new_msg_list = self.session.serialize(msg, ident=new_idents)
             new_msg_list.extend(msg['buffers'])
             self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False)
-            self.client_stream.send_multipart(new_msg_list, copy=False)
+            self.outgoing_streams[i].send_multipart(new_msg_list, copy=False)
+
+    @util.log_errors
+    def dispatch_result(self, raw_msg):
+        try:
+            idents, msg = self.session.feed_identities(raw_msg, copy=False)
+            msg = self.session.deserialize(msg, content=False, copy=False)
+            next_scheduler, previous_scheduler = idents[:2]
+        except Exception as e:
+            self.log.error(
+                f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True
+            )
+            return
+
+        metadata = msg['metadata']
+        msg_id = msg['parent_header']['msg_id']
+        success = metadata['status'] == 'ok'
+        if success:
+            self.all_completed.add(msg_id)
+        else:
+            self.all_failed.add(msg_id)
+
+        original_msg_id = metadata['original_msg_id']
+        self.accumulated_replies[original_msg_id][next_scheduler] = raw_msg
+        raw_msg.pop()
+        raw_msg[0] = previous_scheduler
+
+        if all(
+            msg is not None
+            for msg in self.accumulated_replies[original_msg_id].values()
+        ):
+            self.client_stream.send_multipart(
+                [
+                    msgpart
+                    for msg in self.accumulated_replies[original_msg_id].values()
+                    for msgpart in msg
+                ],
+                copy=False,
+            )
+            self.all_done.add(original_msg_id)
+        self.mon_stream.send_multipart([b'outexpo'] + raw_msg, copy=False)
+
+
+class SpanningTreeLeafScheduler(Scheduler):
     accumulated_replies = {}

+    def __init__(self, *args, **kwargs):
+        super().__init__(**kwargs)
+        self.log.info('Spanning tree leaf scheduler started')
+
     @util.log_errors
     def dispatch_submission(self, raw_msg):
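        # Leaf schedulers are the only tier that talks to engines directly:
        # the request below is re-identified once per engine target and fanned
        # out on engine_stream, mirroring the interior nodes' fan-out to
        # their child schedulers.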
self.log.info(f'Spanning tree leaf msg received ') try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) except Exception as e: - self.log.error(f'exponential scheduler:: Invalid msg: {raw_msg}') + self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}') return header = msg['header'] metadata = msg['metadata'] original_msg_id = header['msg_id'] targets = metadata.get('targets', []) - if not self.is_leaf: - self.send_to_sub_schedulers(msg, targets, idents) - else: - self.accumulated_replies[original_msg_id] = { - f'{original_msg_id}_{target}': None for target in targets - } - for target in targets: - msg_and_target_id = f'{original_msg_id}_{target}' - self.all_ids.add(msg_and_target_id) - header['msg_id'] = msg_and_target_id - new_idents = [cast_bytes(target)] + idents - new_msg_list = self.session.serialize(msg, ident=new_idents) - new_msg_list.extend(msg['buffers']) - - self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) - self.engine_stream.send_multipart(new_msg_list, copy=False) + self.accumulated_replies[original_msg_id] = { + f'{original_msg_id}_{target}': None for target in targets + } + for target in targets: + msg_and_target_id = f'{original_msg_id}_{target}' + self.all_ids.add(msg_and_target_id) + header['msg_id'] = msg_and_target_id + new_idents = [cast_bytes(target)] + idents + new_msg_list = self.session.serialize(msg, ident=new_idents) + new_msg_list.extend(msg['buffers']) + + self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) + self.engine_stream.send_multipart(new_msg_list, copy=False) @util.log_errors def dispatch_result(self, raw_msg): - self.log.info(f'EXPO: received {raw_msg}') + try: + idents, msg = self.session.feed_identities(raw_msg, copy=False) + msg = self.session.deserialize(msg, content=False, copy=False) + engine, previous_scheduler = idents[:2] + except Exception as e: + self.log.error( + f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True + ) + return + + metadata = msg['metadata'] + msg_id = msg['parent_header']['msg_id'] + success = metadata['status'] == 'ok' + if success: + self.all_completed.add(msg_id) + else: + self.all_failed.add(msg_id) + + original_msg_id = metadata['original_msg_id'] + self.accumulated_replies[original_msg_id][msg_id] = raw_msg + raw_msg.pop() + raw_msg[0] = previous_scheduler + + if all( + msg is not None + for msg in self.accumulated_replies[original_msg_id].values() + ): + + self.client_stream.send_multipart( + [ + msgpart + for msg in self.accumulated_replies[original_msg_id].values() + for msgpart in msg + ], + copy=False, + ) + self.all_done.add(original_msg_id) + self.mon_stream.send_multipart([b'outexpo'] + raw_msg, copy=False) + +def get_id_with_prefix(identity): + return bytes(f'sub_scheduler_{identity}', 'utf8') + +def launch_spanning_tree_scheduler( + in_addr, + out_addrs, + mon_addr, + not_addr, + reg_addr, + identity, + config=None, + loglevel=logging.DEBUG, + log_url=None, + is_leaf=False, + in_thread=False, + outgoing_ids=None, +): + config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( + mon_addr, not_addr, reg_addr, config, 'scheduler', log_url, loglevel, in_thread + ) + + is_root = identity == 0 + sub_scheduler_id = get_id_with_prefix(identity) + + incoming_stream = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(incoming_stream, 0) + incoming_stream.setsockopt(zmq.IDENTITY, sub_scheduler_id + b'_in') + + if is_root: + 
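        # only the root binds the client-facing address; every other
        # scheduler connects upward to the address its parent bound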
incoming_stream.bind(in_addr) + else: + incoming_stream.connect(in_addr) + + outgoing_streams = [] + for out_addr in out_addrs: + out = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(out, 0) + out.setsockopt(zmq.IDENTITY, sub_scheduler_id + b'_out') + out.bind(out_addr) + outgoing_streams.append(out) + + scheduler_args = dict( + client_stream=incoming_stream, + mon_stream=mons, + notifier_stream=nots, + query_stream=querys, + loop=loop, + log=log, + config=config, + ) + if is_leaf: + scheduler_args.update( + engine_stream=outgoing_streams[0] + ) + scheduler = SpanningTreeLeafScheduler(**scheduler_args) + else: + scheduler_args.update( + connected_sub_schedulers=[get_id_with_prefix(identity) for identity in outgoing_ids], + outgoing_streams=outgoing_streams, + ) + scheduler = SpanningTreeScheduler(**scheduler_args) + + scheduler.start() + if not in_thread: + try: + loop.start() + except KeyboardInterrupt: + scheduler.log.critical("Interrupted, exiting...") diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index ccee789eb..897a8b4b3 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -22,7 +22,7 @@ # internal: from ipython_genutils.importstring import import_item -from ..apps.ipcontrollerapp import SPANNING_TREE_SCHEDULER_DEPTH +from .exponential_scheduler import SPANNING_TREE_SCHEDULER_DEPTH from ..util import extract_dates from jupyter_client.localinterfaces import localhost from ipython_genutils.py3compat import cast_bytes, unicode_type, iteritems, buffer_to_bytes_py2 @@ -109,6 +109,18 @@ def init_record(msg): } +def get_number_of_leaf_schedulers(): + return 2**SPANNING_TREE_SCHEDULER_DEPTH + + +def get_number_of_tree_spanning_schedulers(): + return 2 * get_number_of_leaf_schedulers() - 1 + + +def get_number_of_non_leaf_schedulers(): + return get_number_of_tree_spanning_schedulers() - get_number_of_leaf_schedulers() + + class EngineConnector(HasTraits): """A simple object for accessing the various zmq connections of an object. 
Attributes are: @@ -131,8 +143,6 @@ class EngineConnector(HasTraits): 'nodb' : 'ipyparallel.controller.dictdb.NoDB', } -def number_of_port_pairs_for_sub_schedulers(depth): - return 2**(depth-1)+2**depth - 1 class HubFactory(RegistrationFactory): """The Configurable for setting up a Hub.""" @@ -167,15 +177,13 @@ def _broadcast_non_coalescing_default(self): def _broadcast_coalescing_default(self): return tuple(util.select_random_ports(2)) - sub_schedulers = List(Trait=Tuple(Integer(), Integer(), config=True, - help="Port pair for queue from client through sub schedulers to engines"), - help="List of available ports for spanning tree schedulers") + sub_schedulers = List(Integer(), config=True, + help="List of available ports for spanning tree schedulers") def _sub_schedulers_default(self): - return [util.select_random_ports(2) for i in range( - number_of_port_pairs_for_sub_schedulers( - SPANNING_TREE_SCHEDULER_DEPTH - ))] + return util.select_random_ports( + get_number_of_leaf_schedulers() + get_number_of_tree_spanning_schedulers() + ) control = Tuple(Integer(), Integer(), config=True, help="""Client/Engine Port pair for Control queue""") @@ -282,13 +290,21 @@ def start(self): self.heartmonitor.start() self.log.info("Heartmonitor started") - def client_url(self, channel): + def client_url(self, channel, index=None): """return full zmq url for a named client channel""" - return "%s://%s:%i" % (self.client_transport, self.client_ip, self.client_info[channel]) + return "%s://%s:%i" % ( + self.client_transport, + self.client_ip, + self.client_info[channel] if index is None else self.client_info[channel][index] + ) - def engine_url(self, channel): + def engine_url(self, channel, index=None): """return full zmq url for a named engine channel""" - return "%s://%s:%i" % (self.engine_transport, self.engine_ip, self.engine_info[channel]) + return "%s://%s:%i" % ( + self.engine_transport, + self.engine_ip, + self.engine_info[channel] if index is None else self.engine_info[channel][index] + ) def init_hub(self): """construct Hub object""" @@ -313,7 +329,7 @@ def init_hub(self): 'iopub' : self.iopub[1], 'broadcast_non_coalescing': self.broadcast_non_coalescing[1], 'broadcast_coalescing': self.broadcast_coalescing[1], - 'sub_scheduler': (outgoing for incoming, outgoing in self.sub_schedulers), + 'sub_schedulers': self.sub_schedulers[-get_number_of_leaf_schedulers():] } client = self.client_info = { @@ -327,7 +343,7 @@ def init_hub(self): 'notification' : self.notifier_port, 'broadcast_non_coalescing': self.broadcast_non_coalescing[0], 'broadcast_coalescing': self.broadcast_coalescing[0], - 'sub_scheduler': (incoming for incoming, outgoing in self.sub_schedulers), + 'sub_schedulers': self.sub_schedulers[:get_number_of_tree_spanning_schedulers()], } self.log.debug("Hub engine addrs: %s", self.engine_info) diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index 750b62b32..b1e7f91c1 100644 --- a/ipyparallel/controller/scheduler.py +++ b/ipyparallel/controller/scheduler.py @@ -159,16 +159,15 @@ def launch_scheduler( util.set_hwm(ins, 0) if identity: ins.setsockopt(zmq.IDENTITY, identity + b'_in') - else: - ins.bind(in_addr) + + ins.bind(in_addr) outs = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(outs, 0) if identity: outs.setsockopt(zmq.IDENTITY, identity + b'_out') - else: - outs.bind(out_addr) + outs.bind(out_addr) scheduler = scheduler_class( client_stream=ins, @@ -187,31 +186,3 @@ def launch_scheduler( loop.start() except KeyboardInterrupt: 
scheduler.log.critical("Interrupted, exiting...") - -def launch_tree_spanning_scheduler( - in_addr, - out_addrs, - mon_addr, - not_addr, - reg_addr, - config=None, - loglevel=logging.DEBUG, - log_url=None, - is_leaf=False, - is_root=False, - in_thread=False, -): - config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( - mon_addr, - not_addr, - reg_addr, - config, - 'scheduler', - log_url, - loglevel, - in_thread - ) - ins = ZMQStream(ctx.socket(zmq.ROUTER), loop) - util.set_hwm(ins, 0) - - diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index dd7d272f9..ba0e69fab 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -206,8 +206,8 @@ def url(key): """get zmq url for given channel""" return str(info["interface"] + ":%i" % info[key]) - def urls(keys): - return (url(key) for key in keys) + def urls(key): + return [f'{info["interface"]}:{port}' for port in info[key]] if content['status'] == 'ok': if self.id is not None and content['id'] != self.id: @@ -235,8 +235,8 @@ def urls(keys): heart.start() # create Shell Connections (MUX, Task, etc.): - shell_addrs = (url('mux'), url('task'), url('broadcast_non_coalescing'), - url('broadcast_coalescing')) + urls('sub_schedulers') + shell_addrs = [url('mux'), url('task'), url('broadcast_non_coalescing'), + url('broadcast_coalescing')] + urls('sub_schedulers') self.log.info(f'ENGINE: shell_addrs: {shell_addrs}') From a964e5a59c73419f15f20eb8d33bacb6508b191a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Mon, 27 Apr 2020 20:14:11 +0200 Subject: [PATCH 18/34] spanning tree scheduler, benchmarks added --- ipyparallel/apps/ipcontrollerapp.py | 6 +- ipyparallel/client/client.py | 591 +++++++++++------- ipyparallel/client/view.py | 4 +- ipyparallel/controller/broadcast_scheduler.py | 58 +- ipyparallel/controller/hub.py | 9 +- ipyparallel/controller/scheduler.py | 34 +- ...cheduler.py => spanning_tree_scheduler.py} | 132 ++-- ipyparallel/engine/kernel.py | 8 +- 8 files changed, 470 insertions(+), 372 deletions(-) rename ipyparallel/controller/{exponential_scheduler.py => spanning_tree_scheduler.py} (60%) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index a7edce656..9486e2f53 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -39,8 +39,7 @@ from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing, \ BroadcastSchedulerCoalescing -from ipyparallel.controller.exponential_scheduler import SpanningTreeScheduler, \ - SPANNING_TREE_SCHEDULER_DEPTH, launch_spanning_tree_scheduler +from ipyparallel.controller.spanning_tree_scheduler import SPANNING_TREE_SCHEDULER_DEPTH, launch_spanning_tree_scheduler from ipyparallel.controller.heartmonitor import HeartMonitor from ipyparallel.controller.hub import HubFactory, get_number_of_non_leaf_schedulers from ipyparallel.controller.scheduler import launch_scheduler @@ -568,7 +567,8 @@ def recursively_start_schedulers(identity, depth): config=dict(self.config), loglevel=self.log_level, log_url=self.log_url, - outgoing_ids=[outgoing_id1, outgoing_id2] + outgoing_ids=[outgoing_id1, outgoing_id2], + depth=depth, ) if is_leaf: scheduler_args.update( diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index c1d34d4c0..fdb81fbae 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -51,9 +51,15 @@ from ..util import ioloop from .asyncresult import AsyncResult, AsyncHubResult from 
.futures import MessageFuture, multi_future -from .view import DirectView, LoadBalancedView, BroadcastViewNonCoalescing, \ - BroadcastViewCoalescing, ExponentialView +from .view import ( + DirectView, + LoadBalancedView, + BroadcastViewNonCoalescing, + BroadcastViewCoalescing, + SpanningTreeView, +) import jupyter_client.session + jupyter_client.session.extract_dates = lambda obj: obj # -------------------------------------------------------------------------- # Decorators for Client methods @@ -87,6 +93,7 @@ def unpack_message(f, self, msg_parts): class ExecuteReply(RichOutput): """wrapper for finished Execute results""" + def __init__(self, msg_id, content, metadata): self.msg_id = msg_id self._content = content @@ -117,6 +124,7 @@ def _metadata(self): def display(self): from IPython.display import publish_display_data + publish_display_data(self.data, self.metadata) def _repr_mime_(self, mime): @@ -144,7 +152,7 @@ def __getattr__(self, key): return self.metadata[key] def __repr__(self): - execute_result = self.metadata['execute_result'] or {'data':{}} + execute_result = self.metadata['execute_result'] or {'data': {}} text_out = execute_result['data'].get('text/plain', '') if len(text_out) > 32: text_out = text_out[:29] + '...' @@ -152,7 +160,7 @@ def __repr__(self): return "" % (self.execution_count, text_out) def _plaintext(self): - execute_result = self.metadata['execute_result'] or {'data':{}} + execute_result = self.metadata['execute_result'] or {'data': {}} text_out = execute_result['data'].get('text/plain', '') if not text_out: @@ -174,14 +182,14 @@ def _plaintext(self): # add newline for multiline reprs text_out = '\n' + text_out - return u''.join([ - out, - u'Out[%i:%i]: ' % ( - self.metadata['engine_id'], self.execution_count - ), - normal, - text_out, - ]) + return u''.join( + [ + out, + u'Out[%i:%i]: ' % (self.metadata['engine_id'], self.execution_count), + normal, + text_out, + ] + ) def _repr_pretty_(self, p, cycle): p.text(self._plaintext()) @@ -195,27 +203,28 @@ class Metadata(dict): These objects have a strict set of keys - errors will raise if you try to add new keys. 
""" + def __init__(self, *args, **kwargs): dict.__init__(self) - md = {'msg_id' : None, - 'submitted' : None, - 'started' : None, - 'completed' : None, - 'received' : None, - 'engine_uuid' : None, - 'engine_id' : None, - 'follow' : None, - 'after' : None, - 'status' : None, - - 'execute_input' : None, - 'execute_result' : None, - 'error' : None, - 'stdout' : '', - 'stderr' : '', - 'outputs' : [], - 'data': {}, - } + md = { + 'msg_id': None, + 'submitted': None, + 'started': None, + 'completed': None, + 'received': None, + 'engine_uuid': None, + 'engine_id': None, + 'follow': None, + 'after': None, + 'status': None, + 'execute_input': None, + 'execute_result': None, + 'error': None, + 'stdout': '', + 'stderr': '', + 'outputs': [], + 'data': {}, + } self.update(md) self.update(dict(*args, **kwargs)) @@ -321,7 +330,6 @@ class Client(HasTraits): """ - block = Bool(False) outstanding = Set() results = Instance('collections.defaultdict', (dict,)) @@ -333,7 +341,8 @@ class Client(HasTraits): _io_loop = Any() _io_thread = Any() - profile=Unicode() + profile = Unicode() + def _profile_default(self): if BaseIPythonApplication.initialized(): # an IPython app *might* be running, try to get its profile @@ -346,37 +355,46 @@ def _profile_default(self): else: return u'default' - _outstanding_dict = Instance('collections.defaultdict', (set,)) _ids = List() - _connected=Bool(False) - _ssh=Bool(False) + _connected = Bool(False) + _ssh = Bool(False) _context = Instance('zmq.Context', allow_none=True) _config = Dict() - _engines=Instance(util.ReverseDict, (), {}) - _query_socket=Instance('zmq.Socket', allow_none=True) - _control_socket=Instance('zmq.Socket', allow_none=True) - _iopub_socket=Instance('zmq.Socket', allow_none=True) - _notification_socket=Instance('zmq.Socket', allow_none=True) - _mux_socket=Instance('zmq.Socket', allow_none=True) - _task_socket=Instance('zmq.Socket', allow_none=True) - _broadcast_non_coalescing_socket=Instance('zmq.Socket', allow_none=True) - _broadcast_coalescing_socket=Instance('zmq.Socket', allow_none=True) + _engines = Instance(util.ReverseDict, (), {}) + _query_socket = Instance('zmq.Socket', allow_none=True) + _control_socket = Instance('zmq.Socket', allow_none=True) + _iopub_socket = Instance('zmq.Socket', allow_none=True) + _notification_socket = Instance('zmq.Socket', allow_none=True) + _mux_socket = Instance('zmq.Socket', allow_none=True) + _task_socket = Instance('zmq.Socket', allow_none=True) + _broadcast_non_coalescing_socket = Instance('zmq.Socket', allow_none=True) + _broadcast_coalescing_socket = Instance('zmq.Socket', allow_none=True) _sub_scheduler_socket = Instance('zmq.Socket', allow_none=True) - - _task_scheme=Unicode() + _task_scheme = Unicode() _closed = False def __new__(self, *args, **kw): # don't raise on positional args return HasTraits.__new__(self, **kw) - def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=None, - context=None, debug=False, - sshserver=None, sshkey=None, password=None, paramiko=None, - timeout=10, cluster_id=None, **extra_args - ): + def __init__( + self, + url_file=None, + profile=None, + profile_dir=None, + ipython_dir=None, + context=None, + debug=False, + sshserver=None, + sshkey=None, + password=None, + paramiko=None, + timeout=10, + cluster_id=None, + **extra_args + ): if profile: super(Client, self).__init__(debug=debug, profile=profile) else: @@ -387,17 +405,21 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No if 'url_or_file' in extra_args: url_file = 
extra_args['url_or_file'] - warnings.warn("url_or_file arg no longer supported, use url_file", DeprecationWarning) + warnings.warn( + "url_or_file arg no longer supported, use url_file", DeprecationWarning + ) if url_file and util.is_url(url_file): raise ValueError("single urls cannot be specified, url-files must be used.") self._setup_profile_dir(self.profile, profile_dir, ipython_dir) - no_file_msg = '\n'.join([ - "You have attempted to connect to an IPython Cluster but no Controller could be found.", - "Please double-check your configuration and ensure that a cluster is running.", - ]) + no_file_msg = '\n'.join( + [ + "You have attempted to connect to an IPython Cluster but no Controller could be found.", + "Please double-check your configuration and ensure that a cluster is running.", + ] + ) if self._cd is not None: if url_file is None: @@ -409,25 +431,25 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No short = compress_user(url_file) if not os.path.exists(url_file): print("Waiting for connection file: %s" % short) - waiting_time = 0. + waiting_time = 0.0 while waiting_time < timeout: - time.sleep(min(timeout-waiting_time, 1)) + time.sleep(min(timeout - waiting_time, 1)) waiting_time += 1 if os.path.exists(url_file): break if not os.path.exists(url_file): - msg = '\n'.join([ - "Connection file %r not found." % short, - no_file_msg, - ]) + msg = '\n'.join( + ["Connection file %r not found." % short, no_file_msg] + ) raise IOError(msg) if url_file is None: raise IOError(no_file_msg) if not os.path.exists(url_file): # Connection file explicitly specified, but not found - raise IOError("Connection file %r not found. Is a controller running?" % \ - compress_user(url_file) + raise IOError( + "Connection file %r not found. Is a controller running?" + % compress_user(url_file) ) with open(url_file) as f: @@ -441,20 +463,20 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No location = cfg.setdefault('location', None) - proto,addr = cfg['interface'].split('://') + proto, addr = cfg['interface'].split('://') addr = util.disambiguate_ip_address(addr, location) cfg['interface'] = "%s://%s" % (proto, addr) # turn interface,port into full urls: for key in ( - 'control', - 'task', - 'mux', - 'iopub', - 'notification', - 'registration', - 'broadcast_non_coalescing', - 'broadcast_coalescing', + 'control', + 'task', + 'mux', + 'iopub', + 'notification', + 'registration', + 'broadcast_non_coalescing', + 'broadcast_coalescing', ): cfg[key] = cfg['interface'] + ':%i' % cfg[key] @@ -468,18 +490,24 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No if not is_local_ip(location_ip) and not sshserver: # load ssh from JSON *only* if the controller is not on # this machine - sshserver=cfg['ssh'] - if not is_local_ip(location_ip) and not sshserver and\ - location != socket.gethostname(): + sshserver = cfg['ssh'] + if ( + not is_local_ip(location_ip) + and not sshserver + and location != socket.gethostname() + ): # warn if no ssh specified, but SSH is probably needed # This is only a warning, because the most likely cause # is a local Controller on a laptop whose IP is dynamic - warnings.warn(""" + warnings.warn( + """ Controller appears to be listening on localhost, but not on this machine. 
If this is true, you should specify Client(...,sshserver='you@%s') - or instruct your controller to listen on an external IP.""" % location, - RuntimeWarning) + or instruct your controller to listen on an external IP.""" + % location, + RuntimeWarning, + ) elif not sshserver: # otherwise sync with cfg sshserver = cfg['ssh'] @@ -492,10 +520,11 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No sshserver = addr if self._ssh and password is None: from zmq.ssh import tunnel + if tunnel.try_passwordless_ssh(sshserver, sshkey, paramiko): - password=False + password = False else: - password = getpass("SSH Password for %s: "%sshserver) + password = getpass("SSH Password for %s: " % sshserver) ssh_kwargs = dict(keyfile=sshkey, password=password, paramiko=paramiko) # configure and construct the session @@ -505,10 +534,12 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No extra_args['key'] = cast_bytes(cfg['key']) extra_args['signature_scheme'] = cfg['signature_scheme'] except KeyError as exc: - msg = '\n'.join([ - "Connection file is invalid (missing '{}'), possibly from an old version of IPython.", - "If you are reusing connection files, remove them and start ipcontroller again." - ]) + msg = '\n'.join( + [ + "Connection file is invalid (missing '{}'), possibly from an old version of IPython.", + "If you are reusing connection files, remove them and start ipcontroller again.", + ] + ) raise ValueError(msg.format(exc.message)) self.session = Session(**extra_args) @@ -517,19 +548,28 @@ def __init__(self, url_file=None, profile=None, profile_dir=None, ipython_dir=No if self._ssh: from zmq.ssh import tunnel - tunnel.tunnel_connection(self._query_socket, cfg['registration'], sshserver, - timeout=timeout, **ssh_kwargs) + + tunnel.tunnel_connection( + self._query_socket, + cfg['registration'], + sshserver, + timeout=timeout, + **ssh_kwargs + ) else: self._query_socket.connect(cfg['registration']) self.session.debug = self.debug - self._notification_handlers = {'registration_notification' : self._register_engine, - 'unregistration_notification' : self._unregister_engine, - 'shutdown_notification' : lambda msg: self.close(), - } - self._queue_handlers = {'execute_reply' : self._handle_execute_reply, - 'apply_reply' : self._handle_apply_reply} + self._notification_handlers = { + 'registration_notification': self._register_engine, + 'unregistration_notification': self._unregister_engine, + 'shutdown_notification': lambda msg: self.close(), + } + self._queue_handlers = { + 'execute_reply': self._handle_execute_reply, + 'apply_reply': self._handle_apply_reply, + } try: self._connect(sshserver, ssh_kwargs, timeout) @@ -563,8 +603,7 @@ def _setup_profile_dir(self, profile, profile_dir, ipython_dir): pass elif profile is not None: try: - self._cd = ProfileDir.find_profile_dir_by_name( - ipython_dir, profile) + self._cd = ProfileDir.find_profile_dir_by_name(ipython_dir, profile) return except ProfileDirError: pass @@ -572,14 +611,17 @@ def _setup_profile_dir(self, profile, profile_dir, ipython_dir): def _update_engines(self, engines): """Update our engines dict and _ids from a dict of the form: {id:uuid}.""" - for k,v in iteritems(engines): + for k, v in iteritems(engines): eid = int(k) if eid not in self._engines: self._ids.append(eid) self._engines[eid] = v self._ids = sorted(self._ids) - if sorted(self._engines.keys()) != list(range(len(self._engines))) and \ - self._task_scheme == 'pure' and self._task_socket: + if ( + 
sorted(self._engines.keys()) != list(range(len(self._engines))) + and self._task_scheme == 'pure' + and self._task_socket + ): self._stop_scheduling_tasks() def _stop_scheduling_tasks(self): @@ -588,11 +630,15 @@ def _stop_scheduling_tasks(self): """ self._task_socket.close() self._task_socket = None - msg = "An engine has been unregistered, and we are using pure " +\ - "ZMQ task scheduling. Task farming will be disabled." + msg = ( + "An engine has been unregistered, and we are using pure " + + "ZMQ task scheduling. Task farming will be disabled." + ) if self.outstanding: - msg += " If you were running tasks when this happened, " +\ - "some `outstanding` msg_ids may never resolve." + msg += ( + " If you were running tasks when this happened, " + + "some `outstanding` msg_ids may never resolve." + ) warnings.warn(msg, RuntimeWarning) def _build_targets(self, targets): @@ -602,7 +648,9 @@ def _build_targets(self, targets): if not self._ids: # flush notification socket if no engines yet, just in case if not self.ids: - raise error.NoEnginesRegistered("Can't build targets without any engines") + raise error.NoEnginesRegistered( + "Can't build targets without any engines" + ) if targets is None: targets = self._ids @@ -610,21 +658,23 @@ def _build_targets(self, targets): if targets.lower() == 'all': targets = self._ids else: - raise TypeError("%r not valid str target, must be 'all'"%(targets)) + raise TypeError("%r not valid str target, must be 'all'" % (targets)) elif isinstance(targets, int): if targets < 0: targets = self.ids[targets] if targets not in self._ids: - raise IndexError("No such engine: %i"%targets) + raise IndexError("No such engine: %i" % targets) targets = [targets] if isinstance(targets, slice): indices = list(range(len(self._ids))[targets]) ids = self.ids - targets = [ ids[i] for i in indices ] + targets = [ids[i] for i in indices] if not isinstance(targets, (tuple, list, xrange)): - raise TypeError("targets by int/slice/collection of ints only, not %s"%(type(targets))) + raise TypeError( + "targets by int/slice/collection of ints only, not %s" % (type(targets)) + ) return [cast_bytes(self._engines[t]) for t in targets], list(targets) @@ -635,11 +685,12 @@ def _connect(self, sshserver, ssh_kwargs, timeout): # Maybe allow reconnecting? 
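        # In addition to the original mux/task/broadcast channels, this
        # handshake now opens a single DEALER socket to the spanning-tree
        # root scheduler; on the client side cfg['sub_schedulers'] resolves
        # to just the root's port, so one connection reaches the whole tree.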
if self._connected: return - self._connected=True + self._connected = True def connect_socket(s, url): if self._ssh: from zmq.ssh import tunnel + return tunnel.tunnel_connection(s, url, sshserver, **ssh_kwargs) else: return s.connect(url) @@ -649,7 +700,7 @@ def connect_socket(s, url): poller = zmq.Poller() poller.register(self._query_socket, zmq.POLLIN) # poll expects milliseconds, timeout is seconds - evts = poller.poll(timeout*1000) + evts = poller.poll(timeout * 1000) if not evts: raise error.TimeoutError("Hub connection request timed out") idents, msg = self.session.recv(self._query_socket, mode=0) @@ -666,12 +717,14 @@ def connect_socket(s, url): connect_socket(self._task_socket, cfg['task']) self._broadcast_non_coalescing_socket = self._context.socket(zmq.DEALER) - connect_socket(self._broadcast_non_coalescing_socket, - cfg['broadcast_non_coalescing']) + connect_socket( + self._broadcast_non_coalescing_socket, cfg['broadcast_non_coalescing'] + ) self._broadcast_coalescing_socket = self._context.socket(zmq.DEALER) - connect_socket(self._broadcast_coalescing_socket, - cfg['broadcast_coalescing']) + connect_socket( + self._broadcast_coalescing_socket, cfg['broadcast_coalescing'] + ) self._sub_scheduler_socket = self._context.socket(zmq.DEALER) connect_socket(self._sub_scheduler_socket, cfg['sub_schedulers']) @@ -695,9 +748,9 @@ def connect_socket(s, url): self._start_io_thread() - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # handlers and callbacks for incoming messages - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def _unwrap_exception(self, content): """unwrap exception, and remap engine_id to int.""" @@ -714,15 +767,19 @@ def _extract_metadata(self, msg): parent = msg['parent_header'] msg_meta = msg['metadata'] content = msg['content'] - md = {'msg_id' : parent['msg_id'], - 'received' : util.utcnow(), - 'engine_uuid' : msg_meta.get('engine', None), - 'follow' : msg_meta.get('follow', []), - 'after' : msg_meta.get('after', []), - 'status' : content['status'], - 'is_broadcast_non_coalescing': msg_meta.get('is_broadcast_non_coalescing', False), - 'is_broadcast_coalescing': msg_meta.get('is_broadcast_coalescing', False) - } + md = { + 'msg_id': parent['msg_id'], + 'received': util.utcnow(), + 'engine_uuid': msg_meta.get('engine', None), + 'follow': msg_meta.get('follow', []), + 'after': msg_meta.get('after', []), + 'status': content['status'], + 'is_broadcast_non_coalescing': msg_meta.get( + 'is_broadcast_non_coalescing', False + ), + 'is_broadcast_coalescing': msg_meta.get('is_broadcast_coalescing', False), + 'is_spanning_tree': msg_meta.get('is_spanning_tree', False) + } if md['engine_uuid'] is not None: md['engine_id'] = self._engines.get(md['engine_uuid'], None) @@ -738,7 +795,7 @@ def _register_engine(self, msg): """Register a new engine, and update our connection info.""" content = msg['content'] eid = content['id'] - d = {eid : content['uuid']} + d = {eid: content['uuid']} self._update_engines(d) def _unregister_engine(self, msg): @@ -769,7 +826,9 @@ def _handle_stranded_msgs(self, eid, uuid): # we already continue try: - raise error.EngineError("Engine %r died while running task %r"%(eid, msg_id)) + raise error.EngineError( + "Engine %r died while running task %r" % (eid, msg_id) + ) except: content = error.wrap_exception() # build a fake message: @@ -789,9 
+848,9 @@ def _handle_execute_reply(self, msg): future = self._futures.get(msg_id, None) if msg_id not in self.outstanding: if msg_id in self.history: - print("got stale result: %s"%msg_id) + print("got stale result: %s" % msg_id) else: - print("got unknown result: %s"%msg_id) + print("got unknown result: %s" % msg_id) else: self.outstanding.remove(msg_id) @@ -828,12 +887,15 @@ def _handle_execute_reply(self, msg): if future: future.set_result(self.results[msg_id]) + def _should_use_metadata_msg_id(self, msg): + md = msg['metadata'] + return md.get('is_broadcast_non_coalescing', False) or md.get( + 'is_broadcast_coalescing', False) or md.get('is_spanning_tree', False) + def _handle_apply_reply(self, msg): """Save the reply to an apply_request into our results.""" parent = msg['parent_header'] - - md = msg['metadata'] - if md.get('is_broadcast_non_coalescing', False) or md.get('is_broadcast_coalescing'): + if self._should_use_metadata_msg_id(msg): msg_id = msg['metadata']['original_msg_id'] else: msg_id = parent['msg_id'] @@ -841,11 +903,11 @@ def _handle_apply_reply(self, msg): future = self._futures.get(msg_id, None) if msg_id not in self.outstanding: if msg_id in self.history: - print("got stale result: %s"%msg_id) + print("got stale result: %s" % msg_id) print(self.results[msg_id]) print(msg) else: - print("got unknown result: %s"%msg_id) + print("got unknown result: %s" % msg_id) else: self.outstanding.remove(msg_id) content = msg['content'] @@ -859,10 +921,9 @@ def _handle_apply_reply(self, msg): if msg_id in e_outstanding: e_outstanding.remove(msg_id) - # construct result: if content['status'] == 'ok': - if md.get('is_broadcast_coalescing', False): + if md.get('is_broadcast_coalescing', False) or md.get('is_spanning_tree', False): self.results[msg_id] = serialize.deserialize_object( msg['buffers'], try_to_extract_all=True ) @@ -891,6 +952,7 @@ def _make_io_loop(self): if 'asyncio' in sys.modules: # tornado 5 on asyncio requires creating a new asyncio loop import asyncio + try: asyncio.get_event_loop() except RuntimeError: @@ -923,15 +985,18 @@ def _setup_streams(self): self._notification_stream.on_recv(self._dispatch_notification, copy=False) self._broadcast_non_coalescing_stream = ZMQStream( - self._broadcast_non_coalescing_socket, self._io_loop) + self._broadcast_non_coalescing_socket, self._io_loop + ) self._broadcast_non_coalescing_stream.on_recv(self._dispatch_reply, copy=False) self._broadcast_coalescing_stream = ZMQStream( - self._broadcast_coalescing_socket, self._io_loop) + self._broadcast_coalescing_socket, self._io_loop + ) self._broadcast_coalescing_stream.on_recv(self._dispatch_reply, copy=False) self._sub_scheduler_stream = ZMQStream( - self._sub_scheduler_socket, self._io_loop) + self._sub_scheduler_socket, self._io_loop + ) self._sub_scheduler_stream.on_recv(self._dispatch_reply, copy=False) @@ -948,7 +1013,9 @@ def _start_io_thread(self): if not self._io_thread.is_alive(): raise RuntimeError("IO Loop failed to start") else: - raise RuntimeError("Start event was never set. Maybe a problem in the IO thread.") + raise RuntimeError( + "Start event was never set. Maybe a problem in the IO thread." 
+ ) def _io_main(self, start_evt=None): """main loop for background IO thread""" @@ -1013,9 +1080,9 @@ def _dispatch_iopub(self, msg): s = md[name] or '' md[name] = s + content['text'] elif msg_type == 'error': - md.update({'error' : self._unwrap_exception(content)}) + md.update({'error': self._unwrap_exception(content)}) elif msg_type == 'execute_input': - md.update({'execute_input' : content['code']}) + md.update({'execute_input': content['code']}) elif msg_type == 'display_data': md['outputs'].append(content) elif msg_type == 'execute_result': @@ -1049,21 +1116,32 @@ def create_message_futures(self, msg_id, async_result=False, track=False): futures.append(output) return futures - def _send(self, socket, msg_type, content=None, parent=None, ident=None, - buffers=None, track=False, header=None, metadata=None): + def _send( + self, + socket, + msg_type, + content=None, + parent=None, + ident=None, + buffers=None, + track=False, + header=None, + metadata=None, + ): """Send a message in the IO thread returns msg object""" if self._closed: raise IOError("Connections have been closed.") - msg = self.session.msg(msg_type, content=content, parent=parent, - header=header, metadata=metadata) + msg = self.session.msg( + msg_type, content=content, parent=parent, header=header, metadata=metadata + ) msg_id = msg['header']['msg_id'] futures = self.create_message_futures( msg_id, async_result=msg_type in {'execute_request', 'apply_request'}, - track=track + track=track, ) def cleanup(f): @@ -1076,7 +1154,9 @@ def cleanup(f): multi_future(futures).add_done_callback(cleanup) def _really_send(): - sent = self.session.send(socket, msg, track=track, buffers=buffers, ident=ident) + sent = self.session.send( + socket, msg, track=track, buffers=buffers, ident=ident + ) if track: futures[0].tracker.set_result(sent['tracker']) @@ -1090,9 +1170,9 @@ def _send_recv(self, *args, **kwargs): future.wait() return future.result() - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # len, getitem - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def __len__(self): """len(client) returns # of engines.""" @@ -1103,7 +1183,9 @@ def __getitem__(self, key): Must be int, slice, or list/tuple/xrange of ints""" if not isinstance(key, (int, slice, tuple, list, xrange)): - raise TypeError("key by int/slice/iterable of ints only, not %s"%(type(key))) + raise TypeError( + "key by int/slice/iterable of ints only, not %s" % (type(key)) + ) else: return self.direct_view(key) @@ -1116,9 +1198,9 @@ def __iter__(self): for eid in self.ids: yield self.direct_view(eid) - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # Begin public methods - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- @property def ids(self): @@ -1158,7 +1240,7 @@ def close(self, linger=None): if self._closed: return self._stop_io_thread() - snames = [ trait for trait in self.trait_names() if trait.endswith("socket") ] + snames = [trait for trait in self.trait_names() if trait.endswith("socket")] for name in snames: socket = getattr(self, name) if socket is not None and not socket.closed: @@ -1170,16 +1252,23 @@ def close(self, linger=None): def 
spin_thread(self, interval=1): """DEPRECATED, DOES NOTHING""" - warnings.warn("Client.spin_thread is deprecated now that IO is always in a thread", DeprecationWarning) + warnings.warn( + "Client.spin_thread is deprecated now that IO is always in a thread", + DeprecationWarning, + ) def stop_spin_thread(self): """DEPRECATED, DOES NOTHING""" - warnings.warn("Client.spin_thread is deprecated now that IO is always in a thread", DeprecationWarning) + warnings.warn( + "Client.spin_thread is deprecated now that IO is always in a thread", + DeprecationWarning, + ) def spin(self): """DEPRECATED, DOES NOTHING""" - warnings.warn("Client.spin is deprecated now that IO is in a thread", DeprecationWarning) - + warnings.warn( + "Client.spin is deprecated now that IO is in a thread", DeprecationWarning + ) def _await_futures(self, futures, timeout): """Wait for a collection of futures""" @@ -1233,8 +1322,9 @@ def wait(self, jobs=None, timeout=-1): # make a copy, so that we aren't passing a mutable collection to _futures_for_msgs theids = set(self.outstanding) else: - if isinstance(jobs, string_types + (int, AsyncResult)) \ - or not isinstance(jobs, Iterable): + if isinstance(jobs, string_types + (int, AsyncResult)) or not isinstance( + jobs, Iterable + ): jobs = [jobs] theids = set() for job in jobs: @@ -1254,22 +1344,22 @@ def wait(self, jobs=None, timeout=-1): futures.extend(self._futures_for_msgs(theids)) return self._await_futures(futures, timeout) - def wait_interactive(self, jobs=None, interval=1., timeout=-1.): + def wait_interactive(self, jobs=None, interval=1.0, timeout=-1.0): """Wait interactively for jobs If no job is specified, will wait for all outstanding jobs to complete. """ if jobs is None: # get futures for results - futures = [ f for f in self._futures.values() if hasattr(f, 'output') ] + futures = [f for f in self._futures.values() if hasattr(f, 'output')] ar = AsyncResult(self, futures, owner=False) else: ar = self._asyncresult_from_jobs(jobs, owner=False) return ar.wait_interactive(interval=interval, timeout=timeout) - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # Control methods - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def clear(self, targets=None, block=None): """Clear the namespace in target(s).""" @@ -1277,7 +1367,9 @@ def clear(self, targets=None, block=None): targets = self._build_targets(targets)[0] futures = [] for t in targets: - futures.append(self._send(self._control_stream, 'clear_request', content={}, ident=t)) + futures.append( + self._send(self._control_stream, 'clear_request', content={}, ident=t) + ) if not block: return multi_future(futures) for future in futures: @@ -1286,7 +1378,6 @@ def clear(self, targets=None, block=None): if msg['content']['status'] != 'ok': raise self._unwrap_exception(msg['content']) - def abort(self, jobs=None, targets=None, block=None): """Abort specific jobs from the execution queues of target(s). 
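[Note: a hypothetical call shape for abort() as reformatted here, not part of
the patch -- jobs may be msg_id strings or AsyncResults, and targets selects
which engines' queues to purge:

    rc.abort(jobs=ar, targets=[0, 1], block=True)
]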
@@ -1309,9 +1400,13 @@ def abort(self, jobs=None, targets=None, block=None): msg_ids = [] if isinstance(jobs, string_types + (AsyncResult,)): jobs = [jobs] - bad_ids = [obj for obj in jobs if not isinstance(obj, string_types + (AsyncResult,))] + bad_ids = [ + obj for obj in jobs if not isinstance(obj, string_types + (AsyncResult,)) + ] if bad_ids: - raise TypeError("Invalid msg_id type %r, expected str or AsyncResult"%bad_ids[0]) + raise TypeError( + "Invalid msg_id type %r, expected str or AsyncResult" % bad_ids[0] + ) for j in jobs: if isinstance(j, AsyncResult): msg_ids.extend(j.msg_ids) @@ -1320,8 +1415,11 @@ def abort(self, jobs=None, targets=None, block=None): content = dict(msg_ids=msg_ids) futures = [] for t in targets: - futures.append(self._send(self._control_stream, 'abort_request', - content=content, ident=t)) + futures.append( + self._send( + self._control_stream, 'abort_request', content=content, ident=t + ) + ) if not block: return multi_future(futures) @@ -1349,6 +1447,7 @@ def shutdown(self, targets='all', restart=False, hub=False, block=None): whether to restart engines after shutting them down. """ from ipyparallel.error import NoEnginesRegistered + if restart: raise NotImplementedError("Engine restart is not yet implemented") @@ -1362,8 +1461,14 @@ def shutdown(self, targets='all', restart=False, hub=False, block=None): futures = [] for t in targets: - futures.append(self._send(self._control_stream, 'shutdown_request', - content={'restart':restart},ident=t)) + futures.append( + self._send( + self._control_stream, + 'shutdown_request', + content={'restart': restart}, + ident=t, + ) + ) error = False if block or hub: for f in futures: @@ -1384,7 +1489,9 @@ def shutdown(self, targets='all', restart=False, hub=False, block=None): if error: raise error - def become_dask(self, targets='all', port=0, nanny=False, scheduler_args=None, **worker_args): + def become_dask( + self, targets='all', port=0, nanny=False, scheduler_args=None, **worker_args + ): """Turn the IPython cluster into a dask.distributed cluster Parameters @@ -1415,10 +1522,12 @@ def become_dask(self, targets='all', port=0, nanny=False, scheduler_args=None, * if scheduler_args is None: scheduler_args = {} else: - scheduler_args = dict(scheduler_args) # copy + scheduler_args = dict(scheduler_args) # copy # Start a Scheduler on the Hub: - reply = self._send_recv(self._query_stream, 'become_dask_request', + reply = self._send_recv( + self._query_stream, + 'become_dask_request', {'scheduler_args': scheduler_args}, ) if reply['content']['status'] != 'ok': @@ -1444,7 +1553,6 @@ def become_dask(self, targets='all', port=0, nanny=False, scheduler_args=None, * return client - def stop_dask(self, targets='all'): """Stop the distributed Scheduler and Workers started by become_dask. 
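[Note: a hypothetical round trip for the dask bridge reformatted above,
assuming distributed is installed and a client `rc` is connected:

    dask_client = rc.become_dask()   # scheduler on the Hub, workers on engines
    # ... use dask_client as a normal distributed.Client ...
    rc.stop_dask()                   # tear the dask cluster back down
]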
@@ -1468,9 +1576,9 @@ def stop_dask(self, targets='all'): become_distributed = become_dask stop_distributed = stop_dask - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # Execution related methods - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def _maybe_raise(self, result): """wrapper for maybe raising an exception if apply failed.""" @@ -1479,15 +1587,18 @@ def _maybe_raise(self, result): return result - def send_apply_request(self, socket, f, args=None, kwargs=None, metadata=None, track=False, - ident=None): + def send_apply_request( + self, socket, f, args=None, kwargs=None, metadata=None, track=False, ident=None + ): """construct and send an apply message via a socket. This is the principal method with which all engine execution is performed by views. """ if self._closed: - raise RuntimeError("Client cannot be used after its sockets have been closed") + raise RuntimeError( + "Client cannot be used after its sockets have been closed" + ) # defaults: args = args if args is not None else [] @@ -1496,21 +1607,30 @@ def send_apply_request(self, socket, f, args=None, kwargs=None, metadata=None, t # validate arguments if not callable(f) and not isinstance(f, (Reference, PrePickled)): - raise TypeError("f must be callable, not %s"%type(f)) + raise TypeError("f must be callable, not %s" % type(f)) if not isinstance(args, (tuple, list)): - raise TypeError("args must be tuple or list, not %s"%type(args)) + raise TypeError("args must be tuple or list, not %s" % type(args)) if not isinstance(kwargs, dict): - raise TypeError("kwargs must be dict, not %s"%type(kwargs)) + raise TypeError("kwargs must be dict, not %s" % type(kwargs)) if not isinstance(metadata, dict): - raise TypeError("metadata must be dict, not %s"%type(metadata)) + raise TypeError("metadata must be dict, not %s" % type(metadata)) - bufs = serialize.pack_apply_message(f, args, kwargs, + bufs = serialize.pack_apply_message( + f, + args, + kwargs, buffer_threshold=self.session.buffer_threshold, item_threshold=self.session.item_threshold, ) - future = self._send(socket, "apply_request", buffers=bufs, ident=ident, - metadata=metadata, track=track) + future = self._send( + socket, + "apply_request", + buffers=bufs, + ident=ident, + metadata=metadata, + track=track, + ) msg_id = future.msg_id self.outstanding.add(msg_id) @@ -1525,13 +1645,17 @@ def send_apply_request(self, socket, f, args=None, kwargs=None, metadata=None, t return future - def send_execute_request(self, socket, code, silent=True, metadata=None, ident=None): + def send_execute_request( + self, socket, code, silent=True, metadata=None, ident=None + ): """construct and send an execute request via a socket. 
""" if self._closed: - raise RuntimeError("Client cannot be used after its sockets have been closed") + raise RuntimeError( + "Client cannot be used after its sockets have been closed" + ) # defaults: metadata = metadata if metadata is not None else {} @@ -1544,9 +1668,9 @@ def send_execute_request(self, socket, code, silent=True, metadata=None, ident=N content = dict(code=code, silent=bool(silent), user_expressions={}) - - future = self._send(socket, "execute_request", content=content, ident=ident, - metadata=metadata) + future = self._send( + socket, "execute_request", content=content, ident=ident, metadata=metadata + ) msg_id = future.msg_id self.outstanding.add(msg_id) @@ -1562,9 +1686,9 @@ def send_execute_request(self, socket, code, silent=True, metadata=None, ident=N return future - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- # construct a View object - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def load_balanced_view(self, targets=None, **kwargs): """construct a DirectView object. @@ -1583,8 +1707,9 @@ def load_balanced_view(self, targets=None, **kwargs): targets = None if targets is not None: targets = self._build_targets(targets)[1] - return LoadBalancedView(client=self, socket=self._task_stream, targets=targets, - **kwargs) + return LoadBalancedView( + client=self, socket=self._task_stream, targets=targets, **kwargs + ) def executor(self, targets=None): """Construct a PEP-3148 Executor with a LoadBalancedView @@ -1647,20 +1772,30 @@ def broadcast_view(self, targets='all', is_coalescing=False, **kwargs): """ targets = self._build_targets(targets)[1] - return BroadcastViewCoalescing( - client=self, socket=self._broadcast_coalescing_stream, targets=targets, **kwargs - ) if is_coalescing else BroadcastViewNonCoalescing( - client=self, - socket=self._broadcast_non_coalescing_stream, - targets=targets, **kwargs) + return ( + BroadcastViewCoalescing( + client=self, + socket=self._broadcast_coalescing_stream, + targets=targets, + **kwargs + ) + if is_coalescing + else BroadcastViewNonCoalescing( + client=self, + socket=self._broadcast_non_coalescing_stream, + targets=targets, + **kwargs + ) + ) - def exponential_view(self, targets='all', **kwargs): - return ExponentialView( + def spanning_tree_view(self, targets='all', **kwargs): + return SpanningTreeView( client=self, socket=self._sub_scheduler_stream, targets=targets, **kwargs ) - #-------------------------------------------------------------------------- + + # -------------------------------------------------------------------------- # Query methods - #-------------------------------------------------------------------------- + # -------------------------------------------------------------------------- def get_result(self, indices_or_msg_ids=None, block=None, owner=True): """Retrieve a result by msg_id or history index, wrapped in an AsyncResult object. 
@@ -1742,14 +1877,14 @@ def resubmit(self, indices_or_msg_ids=None, metadata=None, block=None): indices_or_msg_ids = -1 theids = self._msg_ids_from_jobs(indices_or_msg_ids) - content = dict(msg_ids = theids) + content = dict(msg_ids=theids) reply = self._send_recv(self._query_stream, 'resubmit_request', content) content = reply['content'] if content['status'] != 'ok': raise self._unwrap_exception(content) mapping = content['resubmitted'] - new_ids = [ mapping[msg_id] for msg_id in theids ] + new_ids = [mapping[msg_id] for msg_id in theids] ar = AsyncHubResult(self, new_ids) @@ -1794,15 +1929,17 @@ def result_status(self, msg_ids, status_only=True): local_results[msg_id] = self.results[msg_id] theids.remove(msg_id) - if theids: # some not locally cached + if theids: # some not locally cached content = dict(msg_ids=theids, status_only=status_only) - reply = self._send_recv(self._query_stream, "result_request", content=content) + reply = self._send_recv( + self._query_stream, "result_request", content=content + ) content = reply['content'] if content['status'] != 'ok': raise self._unwrap_exception(content) buffers = reply['buffers'] else: - content = dict(completed=[],pending=[]) + content = dict(completed=[], pending=[]) content['completed'].extend(completed) @@ -1838,7 +1975,7 @@ def result_status(self, msg_ids, status_only=True): if rcontent['status'] == 'ok': if header['msg_type'] == 'apply_reply': - res,buffers = serialize.deserialize_object(buffers) + res, buffers = serialize.deserialize_object(buffers) elif header['msg_type'] == 'execute_reply': res = ExecuteReply(msg_id, rcontent, md) else: @@ -1887,10 +2024,14 @@ def queue_status(self, targets='all', verbose=False): def _msg_ids_from_target(self, targets=None): """Build a list of msg_ids from the list of engine targets""" - if not targets: # needed as _build_targets otherwise uses all engines + if not targets: # needed as _build_targets otherwise uses all engines return [] target_ids = self._build_targets(targets)[0] - return [md_id for md_id in self.metadata if self.metadata[md_id]["engine_uuid"] in target_ids] + return [ + md_id + for md_id in self.metadata + if self.metadata[md_id]["engine_uuid"] in target_ids + ] def _msg_ids_from_jobs(self, jobs=None): """Given a 'jobs' argument, convert it to a list of msg_ids. @@ -1990,7 +2131,9 @@ def purge_local_results(self, jobs=[], targets=[]): if jobs == 'all': if self.outstanding: - raise RuntimeError("Can't purge outstanding tasks: %s" % self.outstanding) + raise RuntimeError( + "Can't purge outstanding tasks: %s" % self.outstanding + ) self.results.clear() self.metadata.clear() self._futures.clear() @@ -2001,14 +2144,15 @@ def purge_local_results(self, jobs=[], targets=[]): msg_ids.update(self._msg_ids_from_jobs(jobs)) still_outstanding = self.outstanding.intersection(msg_ids) if still_outstanding: - raise RuntimeError("Can't purge outstanding tasks: %s" % still_outstanding) + raise RuntimeError( + "Can't purge outstanding tasks: %s" % still_outstanding + ) for mid in msg_ids: self.results.pop(mid, None) self.metadata.pop(mid, None) self._futures.pop(mid, None) self._output_futures.pop(mid, None) - def purge_hub_results(self, jobs=[], targets=[]): """Tell the Hub to forget results. 
@@ -2044,7 +2188,7 @@ def purge_hub_results(self, jobs=[], targets=[]): if content['status'] != 'ok': raise self._unwrap_exception(content) - def purge_results(self, jobs=[], targets=[]): + def purge_results(self, jobs=[], targets=[]): """Clears the cached results from both the hub and the local client Individual results can be purged by msg_id, or the entire @@ -2131,7 +2275,7 @@ def db_query(self, query, keys=None): buffers = reply['buffers'] has_bufs = buffer_lens is not None has_rbufs = result_buffer_lens is not None - for i,rec in enumerate(records): + for i, rec in enumerate(records): # unpack datetime objects for hkey in ('header', 'result_header'): if hkey in rec: @@ -2142,11 +2286,12 @@ def db_query(self, query, keys=None): # relink buffers if has_bufs: blen = buffer_lens[i] - rec['buffers'], buffers = buffers[:blen],buffers[blen:] + rec['buffers'], buffers = buffers[:blen], buffers[blen:] if has_rbufs: blen = result_buffer_lens[i] - rec['result_buffers'], buffers = buffers[:blen],buffers[blen:] + rec['result_buffers'], buffers = buffers[:blen], buffers[blen:] return records -__all__ = [ 'Client' ] + +__all__ = ['Client'] diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index df1bc4c97..3e3bb9b27 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -933,7 +933,7 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, return ar -class ExponentialView(DirectView): +class SpanningTreeView(DirectView): def __init__(self, client=None, socket=None, targets=None): super().__init__(client=client, socket=socket, targets=targets) @@ -1301,5 +1301,5 @@ def shutdown(self, wait=True): self.view.wait() __all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', - 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing'] + 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing', 'SpanningTreeView'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 26f9c3ffd..e5d150798 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -14,27 +14,19 @@ def dispatch_submission(self, raw_msg): try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) - except Exception as e: + except: self.log.error( f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - header = msg['header'] - metadata = msg['metadata'] - original_msg_id = header['msg_id'] - - targets = metadata.get('targets', []) + original_msg_id = msg['header']['msg_id'] + targets = msg['metadata'].get('targets', []) for target in targets: - msg_and_target_id = f'{original_msg_id}_{target}' - self.all_ids.add(msg_and_target_id) - new_idents = [cast_bytes(target)] + idents - - header['msg_id'] = msg_and_target_id - new_msg_list = self.session.serialize(msg, ident=new_idents) - new_msg_list.extend(msg['buffers']) - + new_msg_list = self.append_new_msg_id_to_msg( + self.get_new_msg_id(original_msg_id, target), target, idents, msg + ) self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) self.engine_stream.send_multipart(new_msg_list, copy=False) @@ -42,28 +34,16 @@ def dispatch_submission(self, raw_msg): def dispatch_result(self, raw_msg): try: idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) engine, client = idents[:2] - except Exception as e: + except: self.log.error( 
f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - metadata = msg['metadata'] - msg_id = msg['parent_header']['msg_id'] - - success = metadata['status'] == 'ok' - if success: - self.all_completed.add(msg_id) - else: - self.all_failed.add(msg_id) - # swap ids for ROUTER-ROUTER mirror raw_msg[:2] = [client, engine] self.client_stream.send_multipart(raw_msg, copy=False) - self.all_done.add(msg_id) - self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) @@ -85,26 +65,20 @@ def dispatch_submission(self, raw_msg): ) return - header = msg['header'] metadata = msg['metadata'] - original_msg_id = header['msg_id'] - + original_msg_id = msg['header']['msg_id'] targets = metadata.get('targets', []) + self.accumulated_replies[original_msg_id] = { f'{original_msg_id}_{target}': None for target in targets } metadata['original_msg_id'] = original_msg_id for target in targets: - msg_and_target_id = f'{original_msg_id}_{target}' - self.all_ids.add(msg_and_target_id) - header['msg_id'] = msg_and_target_id - new_idents = [cast_bytes(target)] + idents - new_msg_list = self.session.serialize(msg, ident=new_idents) - new_msg_list.extend(msg['buffers']) - + new_msg_list = self.append_new_msg_id_to_msg( + self.get_new_msg_id(original_msg_id, target), target, idents, msg + ) self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) - # self.log.debug("Sending %r", new_msg_list) self.engine_stream.send_multipart(new_msg_list, copy=False) @util.log_errors @@ -122,12 +96,6 @@ def dispatch_result(self, raw_msg): metadata = msg['metadata'] msg_id = msg['parent_header']['msg_id'] - success = metadata['status'] == 'ok' - if success: - self.all_completed.add(msg_id) - else: - self.all_failed.add(msg_id) - original_msg_id = metadata['original_msg_id'] self.accumulated_replies[original_msg_id][msg_id] = raw_msg raw_msg[:2] = [client, engine] @@ -145,6 +113,4 @@ def dispatch_result(self, raw_msg): ], copy=False, ) - self.all_done.add(original_msg_id) - self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 897a8b4b3..f5bff7a0e 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -22,7 +22,7 @@ # internal: from ipython_genutils.importstring import import_item -from .exponential_scheduler import SPANNING_TREE_SCHEDULER_DEPTH +from .spanning_tree_scheduler import SPANNING_TREE_SCHEDULER_DEPTH from ..util import extract_dates from jupyter_client.localinterfaces import localhost from ipython_genutils.py3compat import cast_bytes, unicode_type, iteritems, buffer_to_bytes_py2 @@ -487,6 +487,8 @@ def __init__(self, **kwargs): b'outtask': self.save_task_result, b'inbcast': self.save_broadcast_request, b'outbcast': self.save_broadcast_result, + b'insptree': self.save_spanning_tree_request, + b'outsptree': self.save_spanning_tree_result, b'tracktask': self.save_task_destination, b'incontrol': _passer, b'outcontrol': _passer, @@ -764,6 +766,11 @@ def save_queue_result(self, idents, msg): except Exception: self.log.error("DB Error updating record %r", msg_id, exc_info=True) + def save_spanning_tree_request(self): + pass + + def save_spanning_tree_result(self): + pass #--------------------- Broadcast traffic ------------------------------ def save_broadcast_request(self, idents, msg): diff --git a/ipyparallel/controller/scheduler.py b/ipyparallel/controller/scheduler.py index b1e7f91c1..e6a6d977a 100644 --- a/ipyparallel/controller/scheduler.py +++ 
b/ipyparallel/controller/scheduler.py @@ -9,6 +9,8 @@ # Distributed under the terms of the Modified BSD License. import logging + +from ipython_genutils.py3compat import cast_bytes from traitlets import observe, Instance, Set, CBytes @@ -82,17 +84,22 @@ def dispatch_result(self, raw_msg): def dispatch_submission(self, raw_msg): raise NotImplementedError("Implement in subclasses") + def append_new_msg_id_to_msg(self, new_id, target_id, idents, msg): + new_idents = [cast_bytes(target_id)] + idents + msg['header']['msg_id'] = new_id + new_msg_list = self.session.serialize(msg, ident=new_idents) + new_msg_list.extend(msg['buffers']) + return new_msg_list + + def get_new_msg_id(self, original_msg_id, outgoing_id): + return f'{original_msg_id}_{outgoing_id if isinstance(outgoing_id, str) else outgoing_id.decode("utf8")}' + + ZMQStream = zmqstream.ZMQStream + def get_common_scheduler_streams( - mon_addr, - not_addr, - reg_addr, - config, - logname, - log_url, - loglevel, - in_thread, + mon_addr, not_addr, reg_addr, config, logname, log_url, loglevel, in_thread ): if config: # unwrap dict back into Config @@ -144,14 +151,7 @@ def launch_scheduler( in_thread=False, ): config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( - mon_addr, - not_addr, - reg_addr, - config, - logname, - log_url, - loglevel, - in_thread + mon_addr, not_addr, reg_addr, config, logname, log_url, loglevel, in_thread ) util.set_hwm(mons, 0) @@ -177,7 +177,7 @@ def launch_scheduler( query_stream=querys, loop=loop, log=log, - config=config + config=config, ) scheduler.start() diff --git a/ipyparallel/controller/exponential_scheduler.py b/ipyparallel/controller/spanning_tree_scheduler.py similarity index 60% rename from ipyparallel/controller/exponential_scheduler.py rename to ipyparallel/controller/spanning_tree_scheduler.py index 60c4352ec..cccfa9623 100644 --- a/ipyparallel/controller/exponential_scheduler.py +++ b/ipyparallel/controller/spanning_tree_scheduler.py @@ -1,7 +1,6 @@ import logging import zmq -from ipython_genutils.py3compat import cast_bytes from ipyparallel import util from ipyparallel.controller.scheduler import ( @@ -10,17 +9,19 @@ ZMQStream, ) -SPANNING_TREE_SCHEDULER_DEPTH = 2 +SPANNING_TREE_SCHEDULER_DEPTH = 3 + class SpanningTreeScheduler(Scheduler): accumulated_replies = {} def __init__( - self, *args, connected_sub_schedulers=None, outgoing_streams=None, **kwargs + self, depth=0, connected_sub_schedulers=None, outgoing_streams=None, **kwargs ): super().__init__(**kwargs) self.connected_sub_schedulers = connected_sub_schedulers self.outgoing_streams = outgoing_streams + self.depth = depth self.log.info('Spanning tree scheduler started') def start(self): @@ -36,68 +37,61 @@ def stop_receiving(self): @util.log_errors def dispatch_submission(self, raw_msg): - self.log.info(f'Spanning tree msg received ') try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) - except Exception as e: + except: self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}') return - header = msg['header'] + metadata = msg['metadata'] - original_msg_id = header['msg_id'] + msg_id = msg['header']['msg_id'] targets = metadata.get('targets', []) + if 'original_msg_id' not in metadata: + metadata['original_msg_id'] = msg_id + metadata['is_spanning_tree'] = True + + original_msg_id = metadata['original_msg_id'] + self.accumulated_replies[original_msg_id] = { - f'{original_msg_id}_{scheduler_id.decode("utf8")}': None - for 
scheduler_id in self.connected_sub_schedulers + scheduler_id: None for scheduler_id in self.connected_sub_schedulers } for i, scheduler_id in enumerate(self.connected_sub_schedulers): - msg_and_scheduler_id = f'{original_msg_id}_{scheduler_id.decode("utf8")}' - targets_for_scheduler = targets[ - i * len(targets)//2: (i + 1) * len(targets)//2 + i * len(targets) // 2 : (i + 1) * len(targets) // 2 ] + if not targets_for_scheduler: - del self.accumulated_replies[original_msg_id][msg_and_scheduler_id] - continue # needs to reply a message to previous scheduler maybe so it won't get stuck waiting - msg['header']['msg_id'] = msg_and_scheduler_id + del self.accumulated_replies[original_msg_id][scheduler_id] + continue + msg['metadata']['targets'] = targets_for_scheduler - self.all_ids.add(msg_and_scheduler_id) - new_idents = [cast_bytes(scheduler_id + b'_in')] + idents - new_msg_list = self.session.serialize(msg, ident=new_idents) - new_msg_list.extend(msg['buffers']) - self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) - self.outgoing_streams[i].send_multipart(new_msg_list, copy=False) + + new_msg = self.append_new_msg_id_to_msg( + self.get_new_msg_id(msg_id, scheduler_id), scheduler_id, idents, msg + ) + self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) + self.outgoing_streams[i].send_multipart(new_msg, copy=False) @util.log_errors def dispatch_result(self, raw_msg): try: idents, msg = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg, content=False, copy=False) - next_scheduler, previous_scheduler = idents[:2] - except Exception as e: + outgoing_scheduler, _ = idents[:2] + except: self.log.error( f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - metadata = msg['metadata'] - msg_id = msg['parent_header']['msg_id'] - success = metadata['status'] == 'ok' - if success: - self.all_completed.add(msg_id) - else: - self.all_failed.add(msg_id) - - original_msg_id = metadata['original_msg_id'] - self.accumulated_replies[original_msg_id][next_scheduler] = raw_msg - raw_msg.pop() - raw_msg[0] = previous_scheduler + original_msg_id = msg['metadata']['original_msg_id'] + self.accumulated_replies[original_msg_id][outgoing_scheduler] = raw_msg[1:] if all( - msg is not None - for msg in self.accumulated_replies[original_msg_id].values() + msg is not None + for msg in self.accumulated_replies[original_msg_id].values() ): self.client_stream.send_multipart( [ @@ -107,8 +101,7 @@ def dispatch_result(self, raw_msg): ], copy=False, ) - self.all_done.add(original_msg_id) - self.mon_stream.send_multipart([b'outexpo'] + raw_msg, copy=False) + self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) class SpanningTreeLeafScheduler(Scheduler): @@ -118,64 +111,48 @@ def __init__(self, *args, **kwargs): super().__init__(**kwargs) self.log.info('Spanning tree leaf scheduler started') - @util.log_errors def dispatch_submission(self, raw_msg): - self.log.info(f'Spanning tree leaf msg received ') try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) except Exception as e: self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}') return - header = msg['header'] + metadata = msg['metadata'] - original_msg_id = header['msg_id'] + original_msg_id = metadata['original_msg_id'] targets = metadata.get('targets', []) + self.accumulated_replies[original_msg_id] = { - f'{original_msg_id}_{target}': None for target in targets + 
bytes(target, 'utf8'): None for target in targets } for target in targets: - msg_and_target_id = f'{original_msg_id}_{target}' - self.all_ids.add(msg_and_target_id) - header['msg_id'] = msg_and_target_id - new_idents = [cast_bytes(target)] + idents - new_msg_list = self.session.serialize(msg, ident=new_idents) - new_msg_list.extend(msg['buffers']) - - self.mon_stream.send_multipart([b'inexpo'] + new_msg_list, copy=False) - self.engine_stream.send_multipart(new_msg_list, copy=False) + new_msg = self.append_new_msg_id_to_msg( + self.get_new_msg_id(original_msg_id, target), target, idents, msg + ) + self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) + self.engine_stream.send_multipart(new_msg, copy=False) @util.log_errors def dispatch_result(self, raw_msg): try: idents, msg = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg, content=False, copy=False) - engine, previous_scheduler = idents[:2] - except Exception as e: + target = idents[0] + except: self.log.error( f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - metadata = msg['metadata'] - msg_id = msg['parent_header']['msg_id'] - success = metadata['status'] == 'ok' - if success: - self.all_completed.add(msg_id) - else: - self.all_failed.add(msg_id) - - original_msg_id = metadata['original_msg_id'] - self.accumulated_replies[original_msg_id][msg_id] = raw_msg - raw_msg.pop() - raw_msg[0] = previous_scheduler + original_msg_id = msg['metadata']['original_msg_id'] + self.accumulated_replies[original_msg_id][target] = raw_msg[1:] if all( msg is not None for msg in self.accumulated_replies[original_msg_id].values() ): - self.client_stream.send_multipart( [ msgpart @@ -184,12 +161,13 @@ def dispatch_result(self, raw_msg): ], copy=False, ) - self.all_done.add(original_msg_id) - self.mon_stream.send_multipart([b'outexpo'] + raw_msg, copy=False) + self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) + def get_id_with_prefix(identity): return bytes(f'sub_scheduler_{identity}', 'utf8') + def launch_spanning_tree_scheduler( in_addr, out_addrs, @@ -203,6 +181,7 @@ def launch_spanning_tree_scheduler( is_leaf=False, in_thread=False, outgoing_ids=None, + depth=0, ): config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( mon_addr, not_addr, reg_addr, config, 'scheduler', log_url, loglevel, in_thread @@ -213,7 +192,7 @@ def launch_spanning_tree_scheduler( incoming_stream = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(incoming_stream, 0) - incoming_stream.setsockopt(zmq.IDENTITY, sub_scheduler_id + b'_in') + incoming_stream.setsockopt(zmq.IDENTITY, sub_scheduler_id) if is_root: incoming_stream.bind(in_addr) @@ -224,7 +203,7 @@ def launch_spanning_tree_scheduler( for out_addr in out_addrs: out = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(out, 0) - out.setsockopt(zmq.IDENTITY, sub_scheduler_id + b'_out') + out.setsockopt(zmq.IDENTITY, sub_scheduler_id) out.bind(out_addr) outgoing_streams.append(out) @@ -236,15 +215,16 @@ def launch_spanning_tree_scheduler( loop=loop, log=log, config=config, + depth=depth, ) if is_leaf: - scheduler_args.update( - engine_stream=outgoing_streams[0] - ) + scheduler_args.update(engine_stream=outgoing_streams[0]) scheduler = SpanningTreeLeafScheduler(**scheduler_args) else: scheduler_args.update( - connected_sub_schedulers=[get_id_with_prefix(identity) for identity in outgoing_ids], + connected_sub_schedulers=[ + get_id_with_prefix(identity) for identity in outgoing_ids + ], 
outgoing_streams=outgoing_streams, ) scheduler = SpanningTreeScheduler(**scheduler_args) diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index 7c7e501d5..33e7ab005 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -52,9 +52,6 @@ def should_handle(self, stream, msg, idents): return False return True - def extract_original_msg_id(self, parent): - return parent.get('metadata', {}).get('original_msg_id', '') - def init_metadata(self, parent): """init metadata dict, for execute/apply_reply""" return { @@ -63,7 +60,10 @@ def init_metadata(self, parent): 'engine' : self.ident, 'is_broadcast_non_coalescing': parent.get('metadata', {}).get('is_broadcast_non_coalescing', False), 'is_broadcast_coalescing': parent.get('metadata', {}).get('is_broadcast_coalescing', False), - 'original_msg_id': self.extract_original_msg_id(parent) + 'original_msg_id': parent.get('metadata', {}).get('original_msg_id', ''), + 'is_spanning_tree': parent.get('metadata', {}).get( + 'is_spanning_tree', False), + } def finish_metadata(self, parent, metadata, reply_content): From 766bdabf611694c8bf1807677ed3a501bf0b87f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 28 Apr 2020 23:22:38 +0200 Subject: [PATCH 19/34] spanning tree scheduler, benchmarks added --- ipyparallel/client/view.py | 3 ++- ipyparallel/controller/spanning_tree_scheduler.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 3e3bb9b27..cc1dd8b55 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -876,7 +876,8 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe self.client.outstanding.add(msg_and_target_id) self.outstanding.add(msg_and_target_id) futures.append(future[0]) - self.outstanding.remove(original_msg_id) + if original_msg_id in self.outstanding: + self.outstanding.remove(original_msg_id) if isinstance(targets, int): futures = futures[0] diff --git a/ipyparallel/controller/spanning_tree_scheduler.py b/ipyparallel/controller/spanning_tree_scheduler.py index cccfa9623..ed89ded6c 100644 --- a/ipyparallel/controller/spanning_tree_scheduler.py +++ b/ipyparallel/controller/spanning_tree_scheduler.py @@ -71,7 +71,7 @@ def dispatch_submission(self, raw_msg): new_msg = self.append_new_msg_id_to_msg( self.get_new_msg_id(msg_id, scheduler_id), scheduler_id, idents, msg ) - self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) + # self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) self.outgoing_streams[i].send_multipart(new_msg, copy=False) @util.log_errors @@ -101,7 +101,7 @@ def dispatch_result(self, raw_msg): ], copy=False, ) - self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) + # self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) class SpanningTreeLeafScheduler(Scheduler): @@ -131,7 +131,7 @@ def dispatch_submission(self, raw_msg): new_msg = self.append_new_msg_id_to_msg( self.get_new_msg_id(original_msg_id, target), target, idents, msg ) - self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) + # self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) self.engine_stream.send_multipart(new_msg, copy=False) @util.log_errors @@ -161,7 +161,7 @@ def dispatch_result(self, raw_msg): ], copy=False, ) - self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) + # self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) def 
get_id_with_prefix(identity): From 8f641cdeb826816c1cd145f5631be112b8093924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 12 May 2020 17:15:06 +0200 Subject: [PATCH 20/34] change broadcastCoalescing to be a broadca --- ipyparallel/apps/ipcontrollerapp.py | 257 +++++++++-------- ipyparallel/client/client.py | 70 ++--- ipyparallel/client/view.py | 48 +--- ipyparallel/controller/broadcast_scheduler.py | 262 ++++++++++++------ ipyparallel/controller/hub.py | 115 +++----- .../controller/spanning_tree_scheduler.py | 237 ---------------- ipyparallel/engine/engine.py | 3 +- ipyparallel/engine/kernel.py | 10 +- ipyparallel/serialize/serialize.py | 12 +- 9 files changed, 406 insertions(+), 608 deletions(-) delete mode 100644 ipyparallel/controller/spanning_tree_scheduler.py diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 9486e2f53..861f8cbc8 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -33,13 +33,10 @@ from ipython_genutils.importstring import import_item from traitlets import Unicode, Bool, List, Dict, TraitError, observe -from jupyter_client.session import ( - Session, session_aliases, session_flags, -) +from jupyter_client.session import Session, session_aliases, session_flags -from ipyparallel.controller.broadcast_scheduler import BroadcastSchedulerNonCoalescing, \ - BroadcastSchedulerCoalescing -from ipyparallel.controller.spanning_tree_scheduler import SPANNING_TREE_SCHEDULER_DEPTH, launch_spanning_tree_scheduler +from ipyparallel.controller.broadcast_scheduler import launch_broadcast_scheduler, \ + SPANNING_TREE_SCHEDULER_DEPTH, BroadcastScheduler from ipyparallel.controller.heartmonitor import HeartMonitor from ipyparallel.controller.hub import HubFactory, get_number_of_non_leaf_schedulers from ipyparallel.controller.scheduler import launch_scheduler @@ -66,10 +63,9 @@ real_dbs.append(MongoDB) - -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Module level variables -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- _description = """Start the IPython controller for parallel computing. @@ -88,103 +84,141 @@ """ -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # The main application -#----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- flags = {} flags.update(base_flags) -flags.update({ - 'usethreads' : ( {'IPControllerApp' : {'use_threads' : True}}, - 'Use threads instead of processes for the schedulers'), - 'sqlitedb' : ({'HubFactory' : {'db_class' : 'ipyparallel.controller.sqlitedb.SQLiteDB'}}, - 'use the SQLiteDB backend'), - 'mongodb' : ({'HubFactory' : {'db_class' : 'ipyparallel.controller.mongodb.MongoDB'}}, - 'use the MongoDB backend'), - 'dictdb' : ({'HubFactory' : {'db_class' : 'ipyparallel.controller.dictdb.DictDB'}}, - 'use the in-memory DictDB backend'), - 'nodb' : ({'HubFactory' : {'db_class' : 'ipyparallel.controller.dictdb.NoDB'}}, - """use dummy DB backend, which doesn't store any information. 
+flags.update( + { + 'usethreads': ( + {'IPControllerApp': {'use_threads': True}}, + 'Use threads instead of processes for the schedulers', + ), + 'sqlitedb': ( + {'HubFactory': {'db_class': 'ipyparallel.controller.sqlitedb.SQLiteDB'}}, + 'use the SQLiteDB backend', + ), + 'mongodb': ( + {'HubFactory': {'db_class': 'ipyparallel.controller.mongodb.MongoDB'}}, + 'use the MongoDB backend', + ), + 'dictdb': ( + {'HubFactory': {'db_class': 'ipyparallel.controller.dictdb.DictDB'}}, + 'use the in-memory DictDB backend', + ), + 'nodb': ( + {'HubFactory': {'db_class': 'ipyparallel.controller.dictdb.NoDB'}}, + """use dummy DB backend, which doesn't store any information. This is the default as of IPython 0.13. To enable delayed or repeated retrieval of results from the Hub, select one of the true db backends. - """), - 'reuse' : ({'IPControllerApp' : {'reuse_files' : True}}, - 'reuse existing json connection files'), - 'restore' : ({'IPControllerApp' : {'restore_engines' : True, 'reuse_files' : True}}, - 'Attempt to restore engines from a JSON file. ' - 'For use when resuming a crashed controller'), -}) + """, + ), + 'reuse': ( + {'IPControllerApp': {'reuse_files': True}}, + 'reuse existing json connection files', + ), + 'restore': ( + {'IPControllerApp': {'restore_engines': True, 'reuse_files': True}}, + 'Attempt to restore engines from a JSON file. ' + 'For use when resuming a crashed controller', + ), + } +) flags.update(session_flags) aliases = dict( - ssh = 'IPControllerApp.ssh_server', - enginessh = 'IPControllerApp.engine_ssh_server', - location = 'IPControllerApp.location', - - url = 'HubFactory.url', - ip = 'HubFactory.ip', - transport = 'HubFactory.transport', - port = 'HubFactory.regport', - - ping = 'HeartMonitor.period', - - scheme = 'TaskScheduler.scheme_name', - hwm = 'TaskScheduler.hwm', + ssh='IPControllerApp.ssh_server', + enginessh='IPControllerApp.engine_ssh_server', + location='IPControllerApp.location', + url='HubFactory.url', + ip='HubFactory.ip', + transport='HubFactory.transport', + port='HubFactory.regport', + ping='HeartMonitor.period', + scheme='TaskScheduler.scheme_name', + hwm='TaskScheduler.hwm', ) aliases.update(base_aliases) aliases.update(session_aliases) - class IPControllerApp(BaseParallelApplication): name = u'ipcontroller' description = _description examples = _examples - classes = [ProfileDir, Session, HubFactory, TaskScheduler, HeartMonitor, DictDB] + real_dbs + classes = [ + ProfileDir, + Session, + HubFactory, + TaskScheduler, + HeartMonitor, + DictDB, + ] + real_dbs # change default to True - auto_create = Bool(True, config=True, - help="""Whether to create profile dir if it doesn't exist.""") + auto_create = Bool( + True, config=True, help="""Whether to create profile dir if it doesn't exist.""" + ) - reuse_files = Bool(False, config=True, + reuse_files = Bool( + False, + config=True, help="""Whether to reuse existing json connection files. If False, connection files will be removed on a clean exit. - """ + """, ) - restore_engines = Bool(False, config=True, + restore_engines = Bool( + False, + config=True, help="""Reload engine state from JSON file - """ + """, ) - ssh_server = Unicode(u'', config=True, + ssh_server = Unicode( + u'', + config=True, help="""ssh url for clients to use when connecting to the Controller processes. It should be of the form: [user@]server[:port]. 
The Controller's listening addresses must be accessible from the ssh server""", ) - engine_ssh_server = Unicode(u'', config=True, + engine_ssh_server = Unicode( + u'', + config=True, help="""ssh url for engines to use when connecting to the Controller processes. It should be of the form: [user@]server[:port]. The Controller's listening addresses must be accessible from the ssh server""", ) - location = Unicode(socket.gethostname(), config=True, + location = Unicode( + socket.gethostname(), + config=True, help="""The external IP or domain name of the Controller, used for disambiguating engine and client connections.""", ) - import_statements = List([], config=True, - help="import statements to be run at startup. Necessary in some environments" + import_statements = List( + [], + config=True, + help="import statements to be run at startup. Necessary in some environments", ) - use_threads = Bool(False, config=True, - help='Use threads instead of processes for the schedulers', + use_threads = Bool( + False, config=True, help='Use threads instead of processes for the schedulers' ) - engine_json_file = Unicode('ipcontroller-engine.json', config=True, - help="JSON filename where engine connection info will be stored.") - client_json_file = Unicode('ipcontroller-client.json', config=True, - help="JSON filename where client connection info will be stored.") + engine_json_file = Unicode( + 'ipcontroller-engine.json', + config=True, + help="JSON filename where engine connection info will be stored.", + ) + client_json_file = Unicode( + 'ipcontroller-client.json', + config=True, + help="JSON filename where client connection info will be stored.", + ) @observe('cluster_id') def _cluster_id_changed(self, change): @@ -192,7 +226,6 @@ def _cluster_id_changed(self, change): self.engine_json_file = "%s-engine.json" % self.name self.client_json_file = "%s-client.json" % self.name - # internal children = List() mq_class = Unicode('zmq.devices.ProcessMonitoredQueue') @@ -203,23 +236,23 @@ def _use_threads_changed(self, change): 'Thread' if change['new'] else 'Process' ) - write_connection_files = Bool(True, + write_connection_files = Bool( + True, help="""Whether to write connection files to disk. 
True in all cases other than runs with `reuse_files=True` *after the first* - """ + """, ) aliases = Dict(aliases) flags = Dict(flags) - def save_connection_dict(self, fname, cdict): """save a connection dict to json file.""" fname = os.path.join(self.profile_dir.security_dir, fname) self.log.info("writing connection info to %s", fname) with open(fname, 'w') as f: f.write(json.dumps(cdict, indent=2)) - os.chmod(fname, stat.S_IRUSR|stat.S_IWUSR) + os.chmod(fname, stat.S_IRUSR | stat.S_IWUSR) def load_config_from_json(self): """load config from existing json connector files.""" @@ -236,7 +269,7 @@ def load_config_from_json(self): # json gives unicode, Session.key wants bytes c.Session.key = ecfg['key'].encode('ascii') - xport,ip = ecfg['interface'].split('://') + xport, ip = ecfg['interface'].split('://') c.HubFactory.engine_ip = ip c.HubFactory.engine_transport = xport @@ -253,7 +286,9 @@ def load_config_from_json(self): ccfg = json.loads(f.read()) for key in ('key', 'registration', 'pack', 'unpack', 'signature_scheme'): - assert ccfg[key] == ecfg[key], "mismatch between engine and client info: %r" % key + assert ccfg[key] == ecfg[key], ( + "mismatch between engine and client info: %r" % key + ) xport, ip = ccfg['interface'].split('://') @@ -290,7 +325,7 @@ def load_secondary_config(self): if self.reuse_files: try: self.load_config_from_json() - except (AssertionError,IOError) as e: + except (AssertionError, IOError) as e: self.log.error("Could not load config from JSON: %s" % e) else: # successfully loaded config from JSON, and reuse=True @@ -319,25 +354,27 @@ def init_hub(self): # save to new json config files f = self.factory base = { - 'key' : f.session.key.decode('ascii'), - 'location' : self.location, - 'pack' : f.session.packer, - 'unpack' : f.session.unpacker, - 'signature_scheme' : f.session.signature_scheme, + 'key': f.session.key.decode('ascii'), + 'location': self.location, + 'pack': f.session.packer, + 'unpack': f.session.unpacker, + 'signature_scheme': f.session.signature_scheme, } - cdict = {'ssh' : self.ssh_server} + cdict = {'ssh': self.ssh_server} cdict.update(f.client_info) cdict.update(base) self.save_connection_dict(self.client_json_file, cdict) - edict = {'ssh' : self.engine_ssh_server} + edict = {'ssh': self.engine_ssh_server} edict.update(f.engine_info) edict.update(base) self.save_connection_dict(self.engine_json_file, edict) fname = "engines%s.json" % self.cluster_id - self.factory.hub.engine_state_file = os.path.join(self.profile_dir.log_dir, fname) + self.factory.hub.engine_state_file = os.path.join( + self.profile_dir.log_dir, fname + ) if self.restore_engines: self.factory.hub._load_engine_state() # load key into config so other sessions in this process (TaskScheduler) @@ -365,13 +402,13 @@ def init_schedulers(self): monitor_url = disambiguate_url(f.monitor_url) # maybe_inproc = 'inproc://monitor' if self.use_threads else monitor_url # IOPub relay (in a Process) - q = mq(zmq.PUB, zmq.SUB, zmq.PUB, b'N/A',b'iopub') + q = mq(zmq.PUB, zmq.SUB, zmq.PUB, b'N/A', b'iopub') q.bind_in(f.client_url('iopub')) q.setsockopt_in(zmq.IDENTITY, ident + b"_iopub") q.bind_out(f.engine_url('iopub')) q.setsockopt_out(zmq.SUBSCRIBE, b'') q.connect_mon(monitor_url) - q.daemon=True + q.daemon = True children.append(q) # Multiplexer Queue (in a Process) @@ -382,7 +419,7 @@ def init_schedulers(self): q.bind_out(f.engine_url('mux')) q.setsockopt_out(zmq.IDENTITY, b'mux_out') q.connect_mon(monitor_url) - q.daemon=True + q.daemon = True children.append(q) # Control Queue (in a Process) @@ 
-392,7 +429,7 @@ def init_schedulers(self): q.bind_out(f.engine_url('control')) q.setsockopt_out(zmq.IDENTITY, b'control_out') q.connect_mon(monitor_url) - q.daemon=True + q.daemon = True children.append(q) if 'TaskScheduler.scheme_name' in self.config: scheme = self.config.TaskScheduler.scheme_name @@ -408,25 +445,19 @@ def init_schedulers(self): q.bind_out(f.engine_url('task')) q.setsockopt_out(zmq.IDENTITY, b'task_out') q.connect_mon(monitor_url) - q.daemon=True + q.daemon = True children.append(q) elif scheme == 'none': self.log.warn("task::using no Task scheduler") else: - self.log.info("task::using Python %s Task scheduler"%scheme) - self.launch_python_scheduler(self.get_python_scheduler_args('task', f, TaskScheduler, monitor_url) - , children) - - self.launch_python_scheduler(self.get_python_scheduler_args( - 'broadcast_non_coalescing', f, BroadcastSchedulerNonCoalescing, monitor_url - ), children) - - self.launch_python_scheduler(self.get_python_scheduler_args( - 'broadcast_coalescing', f, BroadcastSchedulerCoalescing, monitor_url - ), children) + self.log.info("task::using Python %s Task scheduler" % scheme) + self.launch_python_scheduler( + self.get_python_scheduler_args('task', f, TaskScheduler, monitor_url), + children, + ) - self.launch_spanning_tree_schedulers( + self.launch_broadcast_schedulers( f, monitor_url, children ) @@ -445,7 +476,6 @@ def init_schedulers(self): q.setsockopt_out(zmq.RCVHWM, 0) q.setsockopt_mon(zmq.SNDHWM, 0) - def terminate_children(self): child_procs = [] for child in self.children: @@ -482,7 +512,7 @@ def do_import_statements(self): def forward_logging(self): if self.log_url: - self.log.info("Forwarding logging to %s"%self.log_url) + self.log.info("Forwarding logging to %s" % self.log_url) context = zmq.Context.instance() lsock = context.socket(zmq.PUB) lsock.connect(self.log_url) @@ -518,12 +548,7 @@ def start(self): self.cleanup_connection_files() def get_python_scheduler_args( - self, - scheduler_name, - factory, - scheduler_class, - monitor_url, - identity=None, + self, scheduler_name, factory, scheduler_class, monitor_url, identity=None ): return { 'scheduler_class': scheduler_class, @@ -539,19 +564,22 @@ def get_python_scheduler_args( 'config': dict(self.config), } - def launch_spanning_tree_schedulers(self, factory, monitor_url, children): - + def launch_broadcast_schedulers( + self, factory, monitor_url, children + ): def launch_in_thread_or_process(scheduler_args): if 'Process' in self.mq_class: # run the Python scheduler in a Process - q = Process(target=launch_spanning_tree_scheduler, kwargs=scheduler_args) + q = Process( + target=launch_broadcast_scheduler, kwargs=scheduler_args + ) q.daemon = True children.append(q) else: # single-threaded Controller scheduler_args['in_thread'] = True - launch_spanning_tree_scheduler(**scheduler_args) + launch_broadcast_scheduler(**scheduler_args) def recursively_start_schedulers(identity, depth): outgoing_id1 = identity * 2 + 1 @@ -559,7 +587,7 @@ def recursively_start_schedulers(identity, depth): is_leaf = depth == SPANNING_TREE_SCHEDULER_DEPTH scheduler_args = dict( - in_addr=factory.client_url('sub_schedulers', identity), + in_addr=factory.client_url(BroadcastScheduler.port_name, identity), mon_addr=monitor_url, not_addr=disambiguate_url(factory.client_url('notification')), reg_addr=disambiguate_url(factory.client_url('registration')), @@ -573,16 +601,19 @@ def recursively_start_schedulers(identity, depth): if is_leaf: scheduler_args.update( out_addrs=[ - factory.engine_url('sub_schedulers', identity 
- get_number_of_non_leaf_schedulers()) + factory.engine_url( + BroadcastScheduler.port_name, + identity - get_number_of_non_leaf_schedulers(), + ) ], - is_leaf=is_leaf + is_leaf=is_leaf, ) else: scheduler_args.update( out_addrs=[ - factory.client_url('sub_schedulers', outgoing_id1), - factory.client_url('sub_schedulers', outgoing_id2) - ], + factory.client_url(BroadcastScheduler.port_name, outgoing_id1), + factory.client_url(BroadcastScheduler.port_name, outgoing_id2), + ] ) launch_in_thread_or_process(scheduler_args) if not is_leaf: @@ -591,6 +622,7 @@ def recursively_start_schedulers(identity, depth): recursively_start_schedulers(0, 0) + def launch_new_instance(*args, **kwargs): """Create and run the IPython controller""" if sys.platform == 'win32': @@ -601,6 +633,7 @@ def launch_new_instance(*args, **kwargs): # this only comes up when IPython has been installed using vanilla # setuptools, and *not* distribute. import multiprocessing + p = multiprocessing.current_process() # the main process has name 'MainProcess' # subprocesses will have names like 'Process-1' diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index fdb81fbae..db75374b9 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -55,8 +55,7 @@ DirectView, LoadBalancedView, BroadcastViewNonCoalescing, - BroadcastViewCoalescing, - SpanningTreeView, + BroadcastViewCoalescing ) import jupyter_client.session @@ -368,9 +367,7 @@ def _profile_default(self): _notification_socket = Instance('zmq.Socket', allow_none=True) _mux_socket = Instance('zmq.Socket', allow_none=True) _task_socket = Instance('zmq.Socket', allow_none=True) - _broadcast_non_coalescing_socket = Instance('zmq.Socket', allow_none=True) - _broadcast_coalescing_socket = Instance('zmq.Socket', allow_none=True) - _sub_scheduler_socket = Instance('zmq.Socket', allow_none=True) + _broadcast_socket = Instance('zmq.Socket', allow_none=True) _task_scheme = Unicode() _closed = False @@ -475,12 +472,10 @@ def __init__( 'iopub', 'notification', 'registration', - 'broadcast_non_coalescing', - 'broadcast_coalescing', ): cfg[key] = cfg['interface'] + ':%i' % cfg[key] - cfg['sub_schedulers'] = cfg['interface'] + ':%i' % cfg['sub_schedulers'][0] + cfg['broadcast'] = cfg['interface'] + ':%i' % cfg['broadcast'][0] url = cfg['registration'] if location is not None and addr == localhost(): @@ -716,19 +711,11 @@ def connect_socket(s, url): self._task_socket = self._context.socket(zmq.DEALER) connect_socket(self._task_socket, cfg['task']) - self._broadcast_non_coalescing_socket = self._context.socket(zmq.DEALER) + self._broadcast_socket = self._context.socket(zmq.DEALER) connect_socket( - self._broadcast_non_coalescing_socket, cfg['broadcast_non_coalescing'] + self._broadcast_socket, cfg['broadcast'] ) - self._broadcast_coalescing_socket = self._context.socket(zmq.DEALER) - connect_socket( - self._broadcast_coalescing_socket, cfg['broadcast_coalescing'] - ) - - self._sub_scheduler_socket = self._context.socket(zmq.DEALER) - connect_socket(self._sub_scheduler_socket, cfg['sub_schedulers']) - self._notification_socket = self._context.socket(zmq.SUB) self._notification_socket.setsockopt(zmq.SUBSCRIBE, b'') connect_socket(self._notification_socket, cfg['notification']) @@ -774,11 +761,10 @@ def _extract_metadata(self, msg): 'follow': msg_meta.get('follow', []), 'after': msg_meta.get('after', []), 'status': content['status'], - 'is_broadcast_non_coalescing': msg_meta.get( - 'is_broadcast_non_coalescing', False + 'is_broadcast': msg_meta.get( + 
'is_broadcast', False ), - 'is_broadcast_coalescing': msg_meta.get('is_broadcast_coalescing', False), - 'is_spanning_tree': msg_meta.get('is_spanning_tree', False) + 'is_coalescing': msg_meta.get('is_coalescing', False), } if md['engine_uuid'] is not None: @@ -889,8 +875,7 @@ def _handle_execute_reply(self, msg): def _should_use_metadata_msg_id(self, msg): md = msg['metadata'] - return md.get('is_broadcast_non_coalescing', False) or md.get( - 'is_broadcast_coalescing', False) or md.get('is_spanning_tree', False) + return md.get('is_broadcast', False) and md.get('is_coalescing', False) def _handle_apply_reply(self, msg): """Save the reply to an apply_request into our results.""" @@ -923,10 +908,13 @@ def _handle_apply_reply(self, msg): # construct result: if content['status'] == 'ok': - if md.get('is_broadcast_coalescing', False) or md.get('is_spanning_tree', False): - self.results[msg_id] = serialize.deserialize_object( - msg['buffers'], try_to_extract_all=True - ) + if md.get('is_coalescing', False): + deserialized_bufs = [] + bufs = msg['buffers'] + while bufs: + deserialized, bufs = serialize.deserialize_object(bufs) + deserialized_bufs.append(deserialized) + self.results[msg_id] = deserialized_bufs else: self.results[msg_id] = serialize.deserialize_object(msg['buffers'])[0] elif content['status'] == 'aborted': @@ -984,21 +972,10 @@ def _setup_streams(self): self._notification_stream = ZMQStream(self._notification_socket, self._io_loop) self._notification_stream.on_recv(self._dispatch_notification, copy=False) - self._broadcast_non_coalescing_stream = ZMQStream( - self._broadcast_non_coalescing_socket, self._io_loop - ) - self._broadcast_non_coalescing_stream.on_recv(self._dispatch_reply, copy=False) - - self._broadcast_coalescing_stream = ZMQStream( - self._broadcast_coalescing_socket, self._io_loop - ) - self._broadcast_coalescing_stream.on_recv(self._dispatch_reply, copy=False) - - self._sub_scheduler_stream = ZMQStream( - self._sub_scheduler_socket, self._io_loop + self._broadcast_stream = ZMQStream( + self._broadcast_socket, self._io_loop ) - - self._sub_scheduler_stream.on_recv(self._dispatch_reply, copy=False) + self._broadcast_stream.on_recv(self._dispatch_reply, copy=False) def _start_io_thread(self): """Start IOLoop in a background thread.""" @@ -1775,24 +1752,19 @@ def broadcast_view(self, targets='all', is_coalescing=False, **kwargs): return ( BroadcastViewCoalescing( client=self, - socket=self._broadcast_coalescing_stream, + socket=self._broadcast_stream, targets=targets, **kwargs ) if is_coalescing else BroadcastViewNonCoalescing( client=self, - socket=self._broadcast_non_coalescing_stream, + socket=self._broadcast_stream, targets=targets, **kwargs ) ) - def spanning_tree_view(self, targets='all', **kwargs): - return SpanningTreeView( - client=self, socket=self._sub_scheduler_stream, targets=targets, **kwargs - ) - # -------------------------------------------------------------------------- # Query methods # -------------------------------------------------------------------------- diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index cc1dd8b55..86a73ec57 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -862,7 +862,7 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe s_idents = [ident.decode("utf8") for ident in idents] - metadata = dict(targets=s_idents, is_broadcast=True) + metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=False) original_future = 
self.client.send_apply_request( self._socket, pf, pargs, pkwargs, @@ -916,7 +916,7 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, s_idents = [ident.decode("utf8") for ident in idents] - metadata = dict(targets=s_idents, is_broadcast_coalescing=True) + metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=True) message_future = self.client.send_apply_request( self._socket, pf, pargs, pkwargs, @@ -933,47 +933,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, pass return ar - -class SpanningTreeView(DirectView): - def __init__(self, client=None, socket=None, targets=None): - super().__init__(client=client, socket=socket, targets=targets) - - @sync_results - @save_ids - def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, - targets=None): - args = [] if args is None else args - kwargs = {} if kwargs is None else kwargs - block = self.block if block is None else block - track = self.track if track is None else track - targets = self.targets if targets is None else targets - - idents, _targets = self.client._build_targets(targets) - - pf = PrePickled(f) - pargs = [PrePickled(arg) for arg in args] - pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} - - s_idents = [ident.decode("utf8") for ident in idents] - - metadata = dict(targets=s_idents) - - message_future = self.client.send_apply_request( - self._socket, pf, pargs, pkwargs, - track=track, metadata=metadata) - - self.client.outstanding.add(message_future.msg_id) - - ar = AsyncResult(self.client, message_future, fname=getname(f), - targets=_targets, - owner=True) - if block: - try: - return ar.get() - except KeyboardInterrupt: - pass - return ar - class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. 
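A minimal model of the coalesced-reply unpacking in `_handle_apply_reply` above: the scheduler tree concatenates each engine's serialized buffers into one message, and the client peels one object at a time off the front of the buffer list. The `deserialize_object` below is a stand-in that consumes exactly one buffer per object; the real ipyparallel serializer returns `(obj, remaining_buffers)` and may consume several.

def deserialize_object(bufs):
    # Stand-in for ipyparallel.serialize.deserialize_object.
    return bufs[0], bufs[1:]

bufs = ['reply-from-e0', 'reply-from-e1', 'reply-from-e2']
deserialized_bufs = []
while bufs:
    deserialized, bufs = deserialize_object(bufs)
    deserialized_bufs.append(deserialized)

assert deserialized_bufs == ['reply-from-e0', 'reply-from-e1', 'reply-from-e2']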
@@ -1092,6 +1051,7 @@ def set_flags(self, **kwargs): if t is not None: if t < 0: raise ValueError("Invalid timeout: %s"%t) + self.timeout = t @sync_results @@ -1302,5 +1262,5 @@ def shutdown(self, wait=True): self.view.wait() __all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', - 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing', 'SpanningTreeView'] + 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index e5d150798..167f55db7 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -1,116 +1,220 @@ -from ipython_genutils.py3compat import cast_bytes +import logging + +import zmq from ipyparallel import util -from ipyparallel.controller.scheduler import Scheduler +from ipyparallel.controller.scheduler import ( + Scheduler, + get_common_scheduler_streams, + ZMQStream, +) +SPANNING_TREE_SCHEDULER_DEPTH = 2 -class BroadcastSchedulerNonCoalescing(Scheduler): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.log.info('Broadcast non coalescing Scheduler Started') - @util.log_errors - def dispatch_submission(self, raw_msg): - try: - idents, msg_list = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg_list, content=False, copy=False) - except: - self.log.error( - f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True - ) - return +class BroadcastScheduler(Scheduler): + port_name = 'broadcast' + accumulated_replies = {} - original_msg_id = msg['header']['msg_id'] - targets = msg['metadata'].get('targets', []) + def __init__( + self, + *args, + depth=0, + connected_sub_schedulers=None, + outgoing_streams=None, + is_leaf=False, + **kwargs, + ): + super().__init__(**kwargs) + self.log.info('Broadcast Scheduler Started') + self.connected_sub_scheduler_ids = connected_sub_schedulers + self.outgoing_streams = outgoing_streams + self.is_leaf = is_leaf or SPANNING_TREE_SCHEDULER_DEPTH == 0 + self.depth = depth + + def start(self): + self.client_stream.on_recv(self.dispatch_submission, copy=False) + if self.is_leaf: + super().start() + else: + for outgoing_stream in self.outgoing_streams: + outgoing_stream.on_recv(self.dispatch_result, copy=False) + + def send_to_targets(self, msg, original_msg_id, targets, idents, is_coalescing): + if is_coalescing: + self.accumulated_replies[original_msg_id] = { + bytes(target, 'utf8'): None for target in targets + } for target in targets: - new_msg_list = self.append_new_msg_id_to_msg( + new_msg = self.append_new_msg_id_to_msg( self.get_new_msg_id(original_msg_id, target), target, idents, msg ) - self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) - self.engine_stream.send_multipart(new_msg_list, copy=False) - - @util.log_errors - def dispatch_result(self, raw_msg): - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - engine, client = idents[:2] - except: - self.log.error( - f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True + self.engine_stream.send_multipart(new_msg, copy=False) + + def send_to_sub_schedulers( + self, msg, original_msg_id, targets, idents, is_coalescing + ): + if is_coalescing: + + self.accumulated_replies[original_msg_id] = { + scheduler_id: None for scheduler_id in self.connected_sub_scheduler_ids + } + + for i, scheduler_id in enumerate(self.connected_sub_scheduler_ids): + slice_start = i * len(targets) // len(self.connected_sub_scheduler_ids) + 
slice_end = (i + 1) * len(targets) // len(self.connected_sub_scheduler_ids) + targets_for_scheduler = targets[slice_start:slice_end] + if not targets_for_scheduler: + del self.accumulated_replies[original_msg_id][scheduler_id] + msg['metadata']['targets'] = targets_for_scheduler + + new_msg = self.append_new_msg_id_to_msg( + self.get_new_msg_id(original_msg_id, scheduler_id), + scheduler_id, + idents, + msg, ) - return - - # swap ids for ROUTER-ROUTER mirror - raw_msg[:2] = [client, engine] - self.client_stream.send_multipart(raw_msg, copy=False) - self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) - + self.outgoing_streams[i].send_multipart(new_msg, copy=False) -class BroadcastSchedulerCoalescing(Scheduler): - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.log.info('Broadcast coalescing Scheduler Started') - - accumulated_replies = {} + def coalescing_reply(self, raw_msg, msg, original_msg_id, outgoing_id): + if all( + msg is not None or stored_outgoing_id == outgoing_id + for stored_outgoing_id, msg in self.accumulated_replies[ + original_msg_id + ].items() + ): + new_msg = raw_msg[1:] + new_msg.extend( + [ + buffer + for msg_buffers in self.accumulated_replies[ + original_msg_id + ].values() + if msg_buffers + for buffer in msg_buffers + ] + ) + self.client_stream.send_multipart(new_msg, copy=False) + else: + self.accumulated_replies[original_msg_id][outgoing_id] = msg['buffers'] @util.log_errors def dispatch_submission(self, raw_msg): try: idents, msg_list = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg_list, content=False, copy=False) - except Exception as e: + except: self.log.error( f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - metadata = msg['metadata'] - original_msg_id = msg['header']['msg_id'] - targets = metadata.get('targets', []) + msg_id = msg['header']['msg_id'] + targets = metadata['targets'] - self.accumulated_replies[original_msg_id] = { - f'{original_msg_id}_{target}': None for target in targets - } - metadata['original_msg_id'] = original_msg_id + is_coalescing = metadata['is_coalescing'] - for target in targets: - new_msg_list = self.append_new_msg_id_to_msg( - self.get_new_msg_id(original_msg_id, target), target, idents, msg + if 'original_msg_id' not in metadata: + metadata['original_msg_id'] = msg_id + + original_msg_id = metadata['original_msg_id'] + if self.is_leaf: + self.send_to_targets(msg, original_msg_id, targets, idents, is_coalescing) + else: + self.send_to_sub_schedulers( + msg, original_msg_id, targets, idents, is_coalescing ) - self.mon_stream.send_multipart([b'inbcast'] + new_msg_list, copy=False) - self.engine_stream.send_multipart(new_msg_list, copy=False) @util.log_errors def dispatch_result(self, raw_msg): try: idents, msg = self.session.feed_identities(raw_msg, copy=False) msg = self.session.deserialize(msg, content=False, copy=False) - engine, client = idents[:2] - except Exception as e: + outgoing_id = idents[0] + + except: self.log.error( f'broadcast::Invalid broadcast msg: {raw_msg}', exc_info=True ) return - metadata = msg['metadata'] - msg_id = msg['parent_header']['msg_id'] - - original_msg_id = metadata['original_msg_id'] - self.accumulated_replies[original_msg_id][msg_id] = raw_msg - raw_msg[:2] = [client, engine] - - if all( - msg is not None - for msg in self.accumulated_replies[original_msg_id].values() - ): - - self.client_stream.send_multipart( - [ - msgpart - for msg in self.accumulated_replies[original_msg_id].values() - for 
msgpart in msg - ], - copy=False, - ) - self.mon_stream.send_multipart([b'outbcast'] + raw_msg, copy=False) + original_msg_id = msg['metadata']['original_msg_id'] + is_coalescing = msg['metadata']['is_coalescing'] + if is_coalescing: + self.coalescing_reply(raw_msg, msg, original_msg_id, outgoing_id) + else: + self.client_stream.send_multipart(raw_msg[1:], copy=False) + + +def get_id_with_prefix(identity): + return bytes(f'sub_scheduler_{identity}', 'utf8') + + +def launch_broadcast_scheduler( + in_addr, + out_addrs, + mon_addr, + not_addr, + reg_addr, + identity, + config=None, + loglevel=logging.DEBUG, + log_url=None, + is_leaf=False, + in_thread=False, + outgoing_ids=None, + depth=0, +): + config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( + mon_addr, not_addr, reg_addr, config, 'scheduler', log_url, loglevel, in_thread + ) + + is_root = identity == 0 + sub_scheduler_id = get_id_with_prefix(identity) + + incoming_stream = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(incoming_stream, 0) + incoming_stream.setsockopt(zmq.IDENTITY, sub_scheduler_id) + + if is_root: + incoming_stream.bind(in_addr) + else: + incoming_stream.connect(in_addr) + + outgoing_streams = [] + for out_addr in out_addrs: + out = ZMQStream(ctx.socket(zmq.ROUTER), loop) + util.set_hwm(out, 0) + out.setsockopt(zmq.IDENTITY, sub_scheduler_id) + out.bind(out_addr) + outgoing_streams.append(out) + + scheduler_args = dict( + client_stream=incoming_stream, + mon_stream=mons, + notifier_stream=nots, + query_stream=querys, + loop=loop, + log=log, + config=config, + depth=depth, + ) + if is_leaf: + scheduler_args.update(engine_stream=outgoing_streams[0], is_leaf=True) + else: + scheduler_args.update( + connected_sub_schedulers=[ + get_id_with_prefix(identity) for identity in outgoing_ids + ], + outgoing_streams=outgoing_streams, + ) + + scheduler = BroadcastScheduler(**scheduler_args) + + scheduler.start() + if not in_thread: + try: + loop.start() + except KeyboardInterrupt: + scheduler.log.critical("Interrupted, exiting...") diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index f5bff7a0e..987ae28f2 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -22,7 +22,7 @@ # internal: from ipython_genutils.importstring import import_item -from .spanning_tree_scheduler import SPANNING_TREE_SCHEDULER_DEPTH +from .broadcast_scheduler import SPANNING_TREE_SCHEDULER_DEPTH, BroadcastScheduler from ..util import extract_dates from jupyter_client.localinterfaces import localhost from ipython_genutils.py3compat import cast_bytes, unicode_type, iteritems, buffer_to_bytes_py2 @@ -113,12 +113,12 @@ def get_number_of_leaf_schedulers(): return 2**SPANNING_TREE_SCHEDULER_DEPTH -def get_number_of_tree_spanning_schedulers(): +def get_number_of_broadcast_schedulers(): return 2 * get_number_of_leaf_schedulers() - 1 def get_number_of_non_leaf_schedulers(): - return get_number_of_tree_spanning_schedulers() - get_number_of_leaf_schedulers() + return get_number_of_broadcast_schedulers() - get_number_of_leaf_schedulers() class EngineConnector(HasTraits): @@ -129,7 +129,7 @@ class EngineConnector(HasTraits): pending: set of msg_ids stallback: tornado timeout for stalled registration """ - + id = Integer(0) uuid = Unicode() pending = Set() @@ -164,25 +164,12 @@ def _mux_default(self): def _task_default(self): return tuple(util.select_random_ports(2)) - broadcast_non_coalescing = Tuple(Integer(), Integer(), config=True, - help="""Client/Engine Port pair for 
BroadcastNonCoalescing queue""") - - - def _broadcast_non_coalescing_default(self): - return tuple(util.select_random_ports(2)) - - broadcast_coalescing = Tuple(Integer(), Integer(), config=True, - help="""Client/Engine Port pair for BroadcastCoalescing queue""") + broadcast = List(Integer(), config=True, + help="List of available ports for broadcast") - def _broadcast_coalescing_default(self): - return tuple(util.select_random_ports(2)) - - sub_schedulers = List(Integer(), config=True, - help="List of available ports for spanning tree schedulers") - - def _sub_schedulers_default(self): + def _broadcast_default(self): return util.select_random_ports( - get_number_of_leaf_schedulers() + get_number_of_tree_spanning_schedulers() + get_number_of_leaf_schedulers() + get_number_of_broadcast_schedulers() ) control = Tuple(Integer(), Integer(), config=True, @@ -247,7 +234,7 @@ def _engine_ip_default(self): registration_timeout = Integer(0, config=True, help="Engine registration timeout in seconds [default: max(30," "10*heartmonitor.period)]" ) - + def _registration_timeout_default(self): if self.heartmonitor is None: # early initialization, this value will be ignored @@ -297,7 +284,7 @@ def client_url(self, channel, index=None): self.client_ip, self.client_info[channel] if index is None else self.client_info[channel][index] ) - + def engine_url(self, channel, index=None): """return full zmq url for a named engine channel""" return "%s://%s:%i" % ( @@ -305,7 +292,7 @@ def engine_url(self, channel, index=None): self.engine_ip, self.engine_info[channel] if index is None else self.engine_info[channel][index] ) - + def init_hub(self): """construct Hub object""" @@ -316,7 +303,7 @@ def init_hub(self): else: from .task_scheduler import TaskScheduler scheme = TaskScheduler.scheme_name.default_value - + # build connection dicts engine = self.engine_info = { 'interface' : "%s://%s" % (self.engine_transport, self.engine_ip), @@ -327,9 +314,8 @@ def init_hub(self): 'hb_pong' : self.hb[1], 'task' : self.task[1], 'iopub' : self.iopub[1], - 'broadcast_non_coalescing': self.broadcast_non_coalescing[1], - 'broadcast_coalescing': self.broadcast_coalescing[1], - 'sub_schedulers': self.sub_schedulers[-get_number_of_leaf_schedulers():] + BroadcastScheduler.port_name: + self.broadcast[-get_number_of_leaf_schedulers():], } client = self.client_info = { @@ -341,14 +327,13 @@ def init_hub(self): 'task_scheme' : scheme, 'iopub' : self.iopub[0], 'notification' : self.notifier_port, - 'broadcast_non_coalescing': self.broadcast_non_coalescing[0], - 'broadcast_coalescing': self.broadcast_coalescing[0], - 'sub_schedulers': self.sub_schedulers[:get_number_of_tree_spanning_schedulers()], + BroadcastScheduler.port_name: + self.broadcast[:get_number_of_broadcast_schedulers()], } - + self.log.debug("Hub engine addrs: %s", self.engine_info) self.log.debug("Hub client addrs: %s", self.client_info) - + # Registrar socket q = ZMQStream(ctx.socket(zmq.ROUTER), loop) util.set_hwm(q, 0) @@ -372,7 +357,7 @@ def init_hub(self): ) ### Client connections ### - + # Notifier socket n = ZMQStream(ctx.socket(zmq.PUB), loop) n.bind(self.client_url('notification')) @@ -425,9 +410,9 @@ class Hub(SessionFactory): client_info: dict of zmq connection information for engines to connect to the queues. 
""" - + engine_state_file = Unicode() - + # internal data structures: ids=Set() # engine IDs keytable=Dict() @@ -487,8 +472,6 @@ def __init__(self, **kwargs): b'outtask': self.save_task_result, b'inbcast': self.save_broadcast_request, b'outbcast': self.save_broadcast_result, - b'insptree': self.save_spanning_tree_request, - b'outsptree': self.save_spanning_tree_result, b'tracktask': self.save_task_destination, b'incontrol': _passer, b'outcontrol': _passer, @@ -514,7 +497,7 @@ def __init__(self, **kwargs): self.resubmit.on_recv(lambda msg: None, copy=False) self.log.info("hub::created hub") - + def new_engine_id(self, requested_id=None): """generate a new engine integer id. @@ -537,7 +520,7 @@ def new_engine_id(self, requested_id=None): newid = self._idcounter self._idcounter += 1 return newid - + #----------------------------------------------------------------------------- # message validation #----------------------------------------------------------------------------- @@ -623,7 +606,7 @@ def dispatch_query(self, msg): self.session.send(self.query, "hub_error", ident=client_id, content=content, parent=msg) return - + try: f = handler(idents, msg) if f: @@ -766,12 +749,6 @@ def save_queue_result(self, idents, msg): except Exception: self.log.error("DB Error updating record %r", msg_id, exc_info=True) - def save_spanning_tree_request(self): - pass - - def save_spanning_tree_result(self): - pass - #--------------------- Broadcast traffic ------------------------------ def save_broadcast_request(self, idents, msg): client_id = idents[0] @@ -917,7 +894,7 @@ def save_task_result(self, idents, msg): md = msg['metadata'] engine_uuid = md.get('engine', u'') eid = self.by_ident.get(cast_bytes(engine_uuid), None) - + status = md.get('status', None) if msg_id in self.pending: @@ -1006,13 +983,13 @@ def save_iopub_message(self, topics, msg): msg_id = parent['msg_id'] msg_type = msg['header']['msg_type'] content = msg['content'] - + # ensure msg_id is in db try: rec = self.db.get_record(msg_id) except KeyError: rec = None - + # stream d = {} if msg_type == 'stream': @@ -1032,7 +1009,7 @@ def save_iopub_message(self, topics, msg): if not d: return - + if rec is None: # new record rec = empty_record() @@ -1042,7 +1019,7 @@ def save_iopub_message(self, topics, msg): update_record = self.db.add_record else: update_record = self.db.update_record - + try: update_record(msg_id, d) except Exception: @@ -1122,7 +1099,7 @@ def register_engine(self, reg, msg): self.incoming_registrations[heart] = EngineConnector(id=eid,uuid=uuid,stallback=t) else: self.log.error("registration::registration %i failed: %r", eid, content['evalue']) - + return eid def unregister_engine(self, ident, msg): @@ -1133,10 +1110,10 @@ def unregister_engine(self, ident, msg): self.log.error("registration::bad engine id for unregistration: %r", ident, exc_info=True) return self.log.info("registration::unregister_engine(%r)", eid) - + uuid = self.keytable[eid] content=dict(id=eid, uuid=uuid) - + #stop the heartbeats self.hearts.pop(uuid, None) self.heartmonitor.responses.discard(uuid) @@ -1211,7 +1188,7 @@ def finish_registration(self, heart): if self.notifier: self.session.send(self.notifier, "registration_notification", content=content) self.log.info("engine::Engine Connected: %i", eid) - + self._save_engine_state() def _purge_stalled_registration(self, heart): @@ -1228,7 +1205,7 @@ def _purge_stalled_registration(self, heart): def _cleanup_engine_state_file(self): """cleanup engine state mapping""" - + if 
os.path.exists(self.engine_state_file): self.log.debug("cleaning up engine state: %s", self.engine_state_file) try: @@ -1246,11 +1223,11 @@ def _save_engine_state(self): engines = {} for eid, ec in self.engines.items(): engines[eid] = ec.uuid - + state['engines'] = engines - + state['next_id'] = self._idcounter - + with open(self.engine_state_file, 'w') as f: json.dump(state, f) @@ -1259,12 +1236,12 @@ def _load_engine_state(self): """load engine mapping from JSON file""" if not os.path.exists(self.engine_state_file): return - + self.log.info("loading engine state from %s" % self.engine_state_file) - + with open(self.engine_state_file) as f: state = json.load(f) - + save_notifier = self.notifier self.notifier = None for eid, uuid in iteritems(state['engines']): @@ -1272,12 +1249,12 @@ def _load_engine_state(self): # start with this heart as current and beating: self.heartmonitor.responses.add(heart) self.heartmonitor.hearts.add(heart) - + self.incoming_registrations[heart] = EngineConnector(id=int(eid), uuid=uuid) self.finish_registration(heart) - + self.notifier = save_notifier - + self._idcounter = state['next_id'] #------------------------------------------------------------------------- @@ -1450,7 +1427,7 @@ def finish(reply): msg = self.session.msg(header['msg_type'], parent=header) msg_id = msg['msg_id'] msg['content'] = rec['content'] - + # use the old header, but update msg_id and timestamp fresh = msg['header'] header['msg_id'] = fresh['msg_id'] @@ -1469,7 +1446,7 @@ def finish(reply): return finish(error.wrap_exception()) finish(dict(status='ok', resubmitted=resubmitted)) - + # store the new IDs in the Task DB for msg_id, resubmit_id in iteritems(resubmitted): try: @@ -1483,7 +1460,7 @@ def _extract_record(self, rec): io_dict = {} for key in ('execute_input', 'execute_result', 'error', 'stdout', 'stderr'): io_dict[key] = rec[key] - content = { + content = { 'header': rec['header'], 'metadata': rec['metadata'], 'result_metadata': rec['result_metadata'], diff --git a/ipyparallel/controller/spanning_tree_scheduler.py b/ipyparallel/controller/spanning_tree_scheduler.py deleted file mode 100644 index ed89ded6c..000000000 --- a/ipyparallel/controller/spanning_tree_scheduler.py +++ /dev/null @@ -1,237 +0,0 @@ -import logging - -import zmq - -from ipyparallel import util -from ipyparallel.controller.scheduler import ( - Scheduler, - get_common_scheduler_streams, - ZMQStream, -) - -SPANNING_TREE_SCHEDULER_DEPTH = 3 - - -class SpanningTreeScheduler(Scheduler): - accumulated_replies = {} - - def __init__( - self, depth=0, connected_sub_schedulers=None, outgoing_streams=None, **kwargs - ): - super().__init__(**kwargs) - self.connected_sub_schedulers = connected_sub_schedulers - self.outgoing_streams = outgoing_streams - self.depth = depth - self.log.info('Spanning tree scheduler started') - - def start(self): - self.client_stream.on_recv(self.dispatch_submission, copy=False) - for outgoing_stream in self.outgoing_streams: - outgoing_stream.on_recv(self.dispatch_result, copy=False) - - def resume_receiving(self): - self.client_stream.on_recv(self.dispatch_submission) - - def stop_receiving(self): - self.client_stream.on_recv(None) - - @util.log_errors - def dispatch_submission(self, raw_msg): - try: - idents, msg_list = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg_list, content=False, copy=False) - except: - self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}') - return - - metadata = msg['metadata'] - msg_id = msg['header']['msg_id'] - 
targets = metadata.get('targets', []) - if 'original_msg_id' not in metadata: - metadata['original_msg_id'] = msg_id - metadata['is_spanning_tree'] = True - - original_msg_id = metadata['original_msg_id'] - - self.accumulated_replies[original_msg_id] = { - scheduler_id: None for scheduler_id in self.connected_sub_schedulers - } - - for i, scheduler_id in enumerate(self.connected_sub_schedulers): - targets_for_scheduler = targets[ - i * len(targets) // 2 : (i + 1) * len(targets) // 2 - ] - - if not targets_for_scheduler: - del self.accumulated_replies[original_msg_id][scheduler_id] - continue - - msg['metadata']['targets'] = targets_for_scheduler - - new_msg = self.append_new_msg_id_to_msg( - self.get_new_msg_id(msg_id, scheduler_id), scheduler_id, idents, msg - ) - # self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) - self.outgoing_streams[i].send_multipart(new_msg, copy=False) - - @util.log_errors - def dispatch_result(self, raw_msg): - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - outgoing_scheduler, _ = idents[:2] - except: - self.log.error( - f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True - ) - return - - original_msg_id = msg['metadata']['original_msg_id'] - self.accumulated_replies[original_msg_id][outgoing_scheduler] = raw_msg[1:] - - if all( - msg is not None - for msg in self.accumulated_replies[original_msg_id].values() - ): - self.client_stream.send_multipart( - [ - msgpart - for msg in self.accumulated_replies[original_msg_id].values() - for msgpart in msg - ], - copy=False, - ) - # self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) - - -class SpanningTreeLeafScheduler(Scheduler): - accumulated_replies = {} - - def __init__(self, *args, **kwargs): - super().__init__(**kwargs) - self.log.info('Spanning tree leaf scheduler started') - - @util.log_errors - def dispatch_submission(self, raw_msg): - try: - idents, msg_list = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg_list, content=False, copy=False) - except Exception as e: - self.log.error(f'Spanning tree scheduler:: Invalid msg: {raw_msg}') - return - - metadata = msg['metadata'] - original_msg_id = metadata['original_msg_id'] - targets = metadata.get('targets', []) - - self.accumulated_replies[original_msg_id] = { - bytes(target, 'utf8'): None for target in targets - } - for target in targets: - new_msg = self.append_new_msg_id_to_msg( - self.get_new_msg_id(original_msg_id, target), target, idents, msg - ) - # self.mon_stream.send_multipart([b'insptree'] + new_msg, copy=False) - self.engine_stream.send_multipart(new_msg, copy=False) - - @util.log_errors - def dispatch_result(self, raw_msg): - try: - idents, msg = self.session.feed_identities(raw_msg, copy=False) - msg = self.session.deserialize(msg, content=False, copy=False) - target = idents[0] - except: - self.log.error( - f'spanning tree::Invalid broadcast msg: {raw_msg}', exc_info=True - ) - return - - original_msg_id = msg['metadata']['original_msg_id'] - self.accumulated_replies[original_msg_id][target] = raw_msg[1:] - - if all( - msg is not None - for msg in self.accumulated_replies[original_msg_id].values() - ): - self.client_stream.send_multipart( - [ - msgpart - for msg in self.accumulated_replies[original_msg_id].values() - for msgpart in msg - ], - copy=False, - ) - # self.mon_stream.send_multipart([b'outsptree'] + raw_msg, copy=False) - - -def get_id_with_prefix(identity): - 
return bytes(f'sub_scheduler_{identity}', 'utf8') - - -def launch_spanning_tree_scheduler( - in_addr, - out_addrs, - mon_addr, - not_addr, - reg_addr, - identity, - config=None, - loglevel=logging.DEBUG, - log_url=None, - is_leaf=False, - in_thread=False, - outgoing_ids=None, - depth=0, -): - config, ctx, loop, mons, nots, querys, log = get_common_scheduler_streams( - mon_addr, not_addr, reg_addr, config, 'scheduler', log_url, loglevel, in_thread - ) - - is_root = identity == 0 - sub_scheduler_id = get_id_with_prefix(identity) - - incoming_stream = ZMQStream(ctx.socket(zmq.ROUTER), loop) - util.set_hwm(incoming_stream, 0) - incoming_stream.setsockopt(zmq.IDENTITY, sub_scheduler_id) - - if is_root: - incoming_stream.bind(in_addr) - else: - incoming_stream.connect(in_addr) - - outgoing_streams = [] - for out_addr in out_addrs: - out = ZMQStream(ctx.socket(zmq.ROUTER), loop) - util.set_hwm(out, 0) - out.setsockopt(zmq.IDENTITY, sub_scheduler_id) - out.bind(out_addr) - outgoing_streams.append(out) - - scheduler_args = dict( - client_stream=incoming_stream, - mon_stream=mons, - notifier_stream=nots, - query_stream=querys, - loop=loop, - log=log, - config=config, - depth=depth, - ) - if is_leaf: - scheduler_args.update(engine_stream=outgoing_streams[0]) - scheduler = SpanningTreeLeafScheduler(**scheduler_args) - else: - scheduler_args.update( - connected_sub_schedulers=[ - get_id_with_prefix(identity) for identity in outgoing_ids - ], - outgoing_streams=outgoing_streams, - ) - scheduler = SpanningTreeScheduler(**scheduler_args) - - scheduler.start() - if not in_thread: - try: - loop.start() - except KeyboardInterrupt: - scheduler.log.critical("Interrupted, exiting...") diff --git a/ipyparallel/engine/engine.py b/ipyparallel/engine/engine.py index ba0e69fab..c44bc2900 100644 --- a/ipyparallel/engine/engine.py +++ b/ipyparallel/engine/engine.py @@ -235,8 +235,7 @@ def urls(key): heart.start() # create Shell Connections (MUX, Task, etc.): - shell_addrs = [url('mux'), url('task'), url('broadcast_non_coalescing'), - url('broadcast_coalescing')] + urls('sub_schedulers') + shell_addrs = [url('mux'), url('task')] + urls('broadcast') self.log.info(f'ENGINE: shell_addrs: {shell_addrs}') diff --git a/ipyparallel/engine/kernel.py b/ipyparallel/engine/kernel.py index 33e7ab005..69ee17024 100644 --- a/ipyparallel/engine/kernel.py +++ b/ipyparallel/engine/kernel.py @@ -54,16 +54,14 @@ def should_handle(self, stream, msg, idents): def init_metadata(self, parent): """init metadata dict, for execute/apply_reply""" + parent_metadata = parent.get('metadata', {}) return { 'started': utcnow(), 'dependencies_met' : True, 'engine' : self.ident, - 'is_broadcast_non_coalescing': parent.get('metadata', {}).get('is_broadcast_non_coalescing', False), - 'is_broadcast_coalescing': parent.get('metadata', {}).get('is_broadcast_coalescing', False), - 'original_msg_id': parent.get('metadata', {}).get('original_msg_id', ''), - 'is_spanning_tree': parent.get('metadata', {}).get( - 'is_spanning_tree', False), - + 'is_broadcast': parent_metadata.get('is_broadcast', False), + 'is_coalescing': parent_metadata.get('is_coalescing', False), + 'original_msg_id': parent_metadata.get('original_msg_id', ''), } def finish_metadata(self, parent, metadata, reply_content): diff --git a/ipyparallel/serialize/serialize.py b/ipyparallel/serialize/serialize.py index 81850de28..f39e92498 100644 --- a/ipyparallel/serialize/serialize.py +++ b/ipyparallel/serialize/serialize.py @@ -125,7 +125,7 @@ def serialize_object(obj, buffer_threshold=MAX_BYTES, 
item_threshold=MAX_ITEMS): buffers.insert(0, pickle.dumps(cobj, PICKLE_PROTOCOL)) return buffers -def deserialize_object(buffers, g=None, try_to_extract_all=False): +def deserialize_object(buffers, g=None): """reconstruct an object serialized by serialize_object from data buffers. Parameters @@ -143,15 +143,7 @@ def deserialize_object(buffers, g=None, try_to_extract_all=False): bufs = list(buffers) pobj = buffer_to_bytes_py2(bufs.pop(0)) canned = pickle.loads(pobj) - if try_to_extract_all: - unpickled_buffers = [canned] - for buf in bufs: - try: - unpickled_buffers.append(pickle.loads(buffer_to_bytes_py2(buf))) - except Exception: - continue - return unpickled_buffers - elif istype(canned, sequence_types) and len(canned) < MAX_ITEMS: + if istype(canned, sequence_types) and len(canned) < MAX_ITEMS: for c in canned: _restore_buffers(c, bufs) newobj = uncan_sequence(canned, g) From 85c7bb662c0fafcf0edefe295db727311eff715c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Tue, 12 May 2020 18:11:33 +0200 Subject: [PATCH 21/34] trying 400 engines scheduler depth 3 --- ipyparallel/controller/broadcast_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 167f55db7..388939d8d 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -9,7 +9,7 @@ ZMQStream, ) -SPANNING_TREE_SCHEDULER_DEPTH = 2 +SPANNING_TREE_SCHEDULER_DEPTH = 3 class BroadcastScheduler(Scheduler): From c9622fe55220b16e04a4e1aa1ecf2b2cb5d0a0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 15 May 2020 12:17:30 +0200 Subject: [PATCH 22/34] saved results for broadcast spanning tree --- ipyparallel/client/view.py | 1 - ipyparallel/controller/broadcast_scheduler.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 86a73ec57..2264190e4 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -578,7 +578,6 @@ def _really_apply(self, f, args=None, kwargs=None, targets=None, block=None, tra pass return ar - @sync_results def map(self, f, *sequences, **kwargs): """``view.map(f, *sequences, block=self.block)`` => list|AsyncMapResult diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 388939d8d..1b7a9d2b9 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -44,7 +44,7 @@ def send_to_targets(self, msg, original_msg_id, targets, idents, is_coalescing): if is_coalescing: self.accumulated_replies[original_msg_id] = { bytes(target, 'utf8'): None for target in targets - } + } for target in targets: new_msg = self.append_new_msg_id_to_msg( From 04c917f0cf9d0502eb3c172596af589662256dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 15 May 2020 12:52:38 +0200 Subject: [PATCH 23/34] engine on x-axis --- ipyparallel/controller/broadcast_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index 1b7a9d2b9..ced358f72 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -65,7 +65,7 @@ def send_to_sub_schedulers( slice_start = i * len(targets) // len(self.connected_sub_scheduler_ids) slice_end = (i + 1) * 
len(targets) // len(self.connected_sub_scheduler_ids) targets_for_scheduler = targets[slice_start:slice_end] - if not targets_for_scheduler: + if not targets_for_scheduler and is_coalescing: del self.accumulated_replies[original_msg_id][scheduler_id] msg['metadata']['targets'] = targets_for_scheduler From 2816df3cdf199eb4a28052b84975aa47837fb36a Mon Sep 17 00:00:00 2001 From: Min RK Date: Fri, 15 May 2020 13:32:44 +0200 Subject: [PATCH 24/34] use traitlets config to expose HubFactory.broadcast_scheduler_depth --- ipyparallel/apps/ipcontrollerapp.py | 15 +++--- ipyparallel/controller/broadcast_scheduler.py | 31 +++++------- ipyparallel/controller/hub.py | 50 ++++++++++++------- 3 files changed, 54 insertions(+), 42 deletions(-) diff --git a/ipyparallel/apps/ipcontrollerapp.py b/ipyparallel/apps/ipcontrollerapp.py index 861f8cbc8..bfe6506b4 100755 --- a/ipyparallel/apps/ipcontrollerapp.py +++ b/ipyparallel/apps/ipcontrollerapp.py @@ -36,9 +36,9 @@ from jupyter_client.session import Session, session_aliases, session_flags from ipyparallel.controller.broadcast_scheduler import launch_broadcast_scheduler, \ - SPANNING_TREE_SCHEDULER_DEPTH, BroadcastScheduler + BroadcastScheduler from ipyparallel.controller.heartmonitor import HeartMonitor -from ipyparallel.controller.hub import HubFactory, get_number_of_non_leaf_schedulers +from ipyparallel.controller.hub import HubFactory from ipyparallel.controller.scheduler import launch_scheduler from ipyparallel.controller.task_scheduler import TaskScheduler from ipyparallel.controller.dictdb import DictDB @@ -341,7 +341,10 @@ def init_hub(self): self.do_import_statements() try: - self.factory = HubFactory(config=c, log=self.log) + self.factory = HubFactory( + config=c, + log=self.log, + ) # self.start_logging() self.factory.init_hub() except TraitError: @@ -584,7 +587,7 @@ def launch_in_thread_or_process(scheduler_args): def recursively_start_schedulers(identity, depth): outgoing_id1 = identity * 2 + 1 outgoing_id2 = outgoing_id1 + 1 - is_leaf = depth == SPANNING_TREE_SCHEDULER_DEPTH + is_leaf = depth == self.factory.broadcast_scheduler_depth scheduler_args = dict( in_addr=factory.client_url(BroadcastScheduler.port_name, identity), @@ -597,16 +600,16 @@ def recursively_start_schedulers(identity, depth): log_url=self.log_url, outgoing_ids=[outgoing_id1, outgoing_id2], depth=depth, + is_leaf=is_leaf, ) if is_leaf: scheduler_args.update( out_addrs=[ factory.engine_url( BroadcastScheduler.port_name, - identity - get_number_of_non_leaf_schedulers(), + identity - factory.number_of_non_leaf_schedulers, ) ], - is_leaf=is_leaf, ) else: scheduler_args.update( diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index ced358f72..e1db41ecd 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -1,7 +1,10 @@ import logging +import os import zmq +from traitlets import Integer, List, Bytes, Bool + from ipyparallel import util from ipyparallel.controller.scheduler import ( Scheduler, @@ -9,30 +12,22 @@ ZMQStream, ) -SPANNING_TREE_SCHEDULER_DEPTH = 3 - class BroadcastScheduler(Scheduler): port_name = 'broadcast' accumulated_replies = {} - def __init__( - self, - *args, - depth=0, - connected_sub_schedulers=None, - outgoing_streams=None, - is_leaf=False, - **kwargs, - ): - super().__init__(**kwargs) - self.log.info('Broadcast Scheduler Started') - self.connected_sub_scheduler_ids = connected_sub_schedulers - self.outgoing_streams = outgoing_streams - 
self.is_leaf = is_leaf or SPANNING_TREE_SCHEDULER_DEPTH == 0 - self.depth = depth + depth = Integer(0) + is_leaf = Bool(False) + connected_sub_scheduler_ids = List(Bytes()) + outgoing_streams = List() def start(self): + self.log.info( + 'Broadcast Scheduler started with depth=%s, pid=%s', + self.depth, + os.getpid(), + ) self.client_stream.on_recv(self.dispatch_submission, copy=False) if self.is_leaf: super().start() @@ -204,7 +199,7 @@ def launch_broadcast_scheduler( scheduler_args.update(engine_stream=outgoing_streams[0], is_leaf=True) else: scheduler_args.update( - connected_sub_schedulers=[ + connected_sub_scheduler_ids=[ get_id_with_prefix(identity) for identity in outgoing_ids ], outgoing_streams=outgoing_streams, diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 987ae28f2..595a376c9 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -22,14 +22,15 @@ # internal: from ipython_genutils.importstring import import_item -from .broadcast_scheduler import SPANNING_TREE_SCHEDULER_DEPTH, BroadcastScheduler +from .broadcast_scheduler import BroadcastScheduler from ..util import extract_dates from jupyter_client.localinterfaces import localhost from ipython_genutils.py3compat import cast_bytes, unicode_type, iteritems, buffer_to_bytes_py2 from traitlets import ( HasTraits, Any, Instance, Integer, Unicode, Dict, Set, Tuple, - DottedObjectName, observe, - List) + DottedObjectName, default, observe, + List, +) from datetime import datetime from ipyparallel import error, util @@ -109,18 +110,6 @@ def init_record(msg): } -def get_number_of_leaf_schedulers(): - return 2**SPANNING_TREE_SCHEDULER_DEPTH - - -def get_number_of_broadcast_schedulers(): - return 2 * get_number_of_leaf_schedulers() - 1 - - -def get_number_of_non_leaf_schedulers(): - return get_number_of_broadcast_schedulers() - get_number_of_leaf_schedulers() - - class EngineConnector(HasTraits): """A simple object for accessing the various zmq connections of an object. 
Attributes are: @@ -164,12 +153,37 @@ def _mux_default(self): def _task_default(self): return tuple(util.select_random_ports(2)) + + broadcast_scheduler_depth = Integer( + 3, + config=True, + help="Depth of spanning tree schedulers", + ) + number_of_leaf_schedulers = Integer() + number_of_broadcast_schedulers = Integer() + number_of_non_leaf_schedulers = Integer() + + @default('number_of_leaf_schedulers') + def get_number_of_leaf_schedulers(self): + return 2 ** self.broadcast_scheduler_depth + + + @default('number_of_broadcast_schedulers') + def get_number_of_broadcast_schedulers(self): + return 2 * self.number_of_leaf_schedulers - 1 + + + @default('number_of_non_leaf_schedulers') + def get_number_of_non_leaf_schedulers(self): + return self.number_of_broadcast_schedulers - self.number_of_leaf_schedulers + + broadcast = List(Integer(), config=True, help="List of available ports for broadcast") def _broadcast_default(self): return util.select_random_ports( - get_number_of_leaf_schedulers() + get_number_of_broadcast_schedulers() + self.number_of_leaf_schedulers + self.number_of_broadcast_schedulers ) control = Tuple(Integer(), Integer(), config=True, @@ -315,7 +329,7 @@ def init_hub(self): 'task' : self.task[1], 'iopub' : self.iopub[1], BroadcastScheduler.port_name: - self.broadcast[-get_number_of_leaf_schedulers():], + self.broadcast[-self.number_of_leaf_schedulers:], } client = self.client_info = { @@ -328,7 +342,7 @@ def init_hub(self): 'iopub' : self.iopub[0], 'notification' : self.notifier_port, BroadcastScheduler.port_name: - self.broadcast[:get_number_of_broadcast_schedulers()], + self.broadcast[:self.number_of_broadcast_schedulers], } self.log.debug("Hub engine addrs: %s", self.engine_info) From 48cf3839b6b5c562997be18c09382f3d1f4c6bca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Fri, 15 May 2020 20:00:12 +0200 Subject: [PATCH 25/34] Async and depth benchmarks --- ipyparallel/client/client.py | 23 ++--- ipyparallel/client/view.py | 84 +++++-------------- ipyparallel/controller/broadcast_scheduler.py | 1 - ipyparallel/controller/hub.py | 1 - 4 files changed, 30 insertions(+), 79 deletions(-) diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index db75374b9..565a1ce60 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -54,8 +54,7 @@ from .view import ( DirectView, LoadBalancedView, - BroadcastViewNonCoalescing, - BroadcastViewCoalescing + BroadcastView, ) import jupyter_client.session @@ -1749,21 +1748,13 @@ def broadcast_view(self, targets='all', is_coalescing=False, **kwargs): """ targets = self._build_targets(targets)[1] - return ( - BroadcastViewCoalescing( - client=self, - socket=self._broadcast_stream, - targets=targets, - **kwargs - ) - if is_coalescing - else BroadcastViewNonCoalescing( - client=self, - socket=self._broadcast_stream, - targets=targets, - **kwargs - ) + bcast_view = BroadcastView( + client=self, + socket=self._broadcast_stream, + targets=targets, ) + bcast_view.is_coalescing = is_coalescing + return bcast_view # -------------------------------------------------------------------------- # Query methods diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 50d6ddfd0..53f288a98 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -838,10 +838,8 @@ def activate(self, suffix=''): ip.magics_manager.register(M) -class BroadcastViewNonCoalescing(DirectView): - def __init__(self, client=None, socket=None, targets=None): - 
super().__init__(client=client, socket=socket, targets=targets) - +class BroadcastView(DirectView): + is_coalescing = Bool(False) @sync_results @save_ids @@ -861,25 +859,30 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe s_idents = [ident.decode("utf8") for ident in idents] - metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=False) + metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=self.is_coalescing) - original_future = self.client.send_apply_request( + if not self.is_coalescing: + original_future = self.client.send_apply_request( self._socket, pf, pargs, pkwargs, track=track, metadata=metadata) + original_msg_id = original_future.msg_id + + for ident in s_idents: + msg_and_target_id = f'{original_msg_id}_{ident}' + future = self.client.create_message_futures(msg_and_target_id, async_result=True, track=True) + self.client.outstanding.add(msg_and_target_id) + self.outstanding.add(msg_and_target_id) + futures.append(future[0]) + if original_msg_id in self.outstanding: + self.outstanding.remove(original_msg_id) + else: + message_future = self.client.send_apply_request( + self._socket, pf, pargs, pkwargs, + track=track, metadata=metadata + ) + self.client.outstanding.add(message_future.msg_id) + futures = message_future - original_msg_id = original_future.msg_id - - for ident in s_idents: - msg_and_target_id = f'{original_msg_id}_{ident}' - future = self.client.create_message_futures(msg_and_target_id, async_result=True, track=True) - self.client.outstanding.add(msg_and_target_id) - self.outstanding.add(msg_and_target_id) - futures.append(future[0]) - if original_msg_id in self.outstanding: - self.outstanding.remove(original_msg_id) - - if isinstance(targets, int): - futures = futures[0] ar = AsyncResult(self.client, futures, fname=getname(f), targets=_targets, owner=True) if block: @@ -892,46 +895,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe def map(self, f, *sequences, **kwargs): pass - -class BroadcastViewCoalescing(DirectView): - def __init__(self, client=None, socket=None, targets=None): - super().__init__(client=client, socket=socket, targets=targets) - - @sync_results - @save_ids - def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, - targets=None): - args = [] if args is None else args - kwargs = {} if kwargs is None else kwargs - block = self.block if block is None else block - track = self.track if track is None else track - targets = self.targets if targets is None else targets - - idents, _targets = self.client._build_targets(targets) - - pf = PrePickled(f) - pargs = [PrePickled(arg) for arg in args] - pkwargs = {k: PrePickled(v) for k, v in kwargs.items()} - - s_idents = [ident.decode("utf8") for ident in idents] - - metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=True) - - message_future = self.client.send_apply_request( - self._socket, pf, pargs, pkwargs, - track=track, metadata=metadata) - - self.client.outstanding.add(message_future.msg_id) - - ar = AsyncResult(self.client, message_future, fname=getname(f), targets=_targets, - owner=True) - if block: - try: - return ar.get() - except KeyboardInterrupt: - pass - return ar - class LoadBalancedView(View): """An load-balancing View that only executes via the Task scheduler. 
@@ -1260,6 +1223,5 @@ def shutdown(self, wait=True): if wait: self.view.wait() -__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', - 'BroadcastViewNonCoalescing', 'BroadcastViewCoalescing'] +__all__ = ['LoadBalancedView', 'DirectView', 'ViewExecutor', 'BroadcastView'] diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py index e1db41ecd..47d335f4d 100644 --- a/ipyparallel/controller/broadcast_scheduler.py +++ b/ipyparallel/controller/broadcast_scheduler.py @@ -193,7 +193,6 @@ def launch_broadcast_scheduler( loop=loop, log=log, config=config, - depth=depth, ) if is_leaf: scheduler_args.update(engine_stream=outgoing_streams[0], is_leaf=True) diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py index 3760d162e..c5472a76c 100644 --- a/ipyparallel/controller/hub.py +++ b/ipyparallel/controller/hub.py @@ -153,7 +153,6 @@ def _mux_default(self): def _task_default(self): return tuple(util.select_random_ports(2)) - broadcast_scheduler_depth = Integer( 3, config=True, From 2e27a17a4fb34ef3d89f5cc8e9faa0f7e535b421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Thu, 21 May 2020 15:35:07 +0200 Subject: [PATCH 26/34] added print out for active thread count --- ipyparallel/client/client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index 565a1ce60..b3b5ec6ae 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -5,6 +5,8 @@ from __future__ import print_function +import threading + try: from collections.abc import Iterable except ImportError: # py2 @@ -565,6 +567,8 @@ def __init__( 'apply_reply': self._handle_apply_reply, } + self.log.info(f'number of active threads running on client init {threading.active_count()}') + try: self._connect(sshserver, ssh_kwargs, timeout) except: From f3ae17f695f17031c4dddc4dbc038056833ac66e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Thu, 21 May 2020 16:05:10 +0200 Subject: [PATCH 27/34] changing self.log.info with print --- ipyparallel/client/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py index b3b5ec6ae..03c69e920 100644 --- a/ipyparallel/client/client.py +++ b/ipyparallel/client/client.py @@ -567,7 +567,7 @@ def __init__( 'apply_reply': self._handle_apply_reply, } - self.log.info(f'number of active threads running on client init {threading.active_count()}') + print(f'number of active threads running on client init {threading.active_count()}') try: self._connect(sshserver, ssh_kwargs, timeout) From 8d410f4bb679e3e9164aa7a4260d59ca4f071a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?= Date: Thu, 21 May 2020 16:36:02 +0200 Subject: [PATCH 28/34] removing delay parameter --- ipyparallel/client/view.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py index 53f288a98..c4683488f 100644 --- a/ipyparallel/client/view.py +++ b/ipyparallel/client/view.py @@ -6,6 +6,7 @@ from __future__ import absolute_import, print_function import imp +import threading import warnings from contextlib import contextmanager @@ -860,7 +861,7 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe s_idents = [ident.decode("utf8") for ident in idents] metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=self.is_coalescing) - + print(f'Active threads on 
broadcastView: {threading.active_count()}')
 
From 986cf0a0bf74ee79af1f8bf53b37a25094cce5cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Tue, 26 May 2020 16:33:30 +0200
Subject: [PATCH 29/34] optimistically trying 1000 engines with all the
 benchmarks brought back

---
 ipyparallel/client/client.py | 2 --
 ipyparallel/client/view.py | 2 --
 2 files changed, 4 deletions(-)

diff --git a/ipyparallel/client/client.py b/ipyparallel/client/client.py
index 03c69e920..d9af2a659 100644
--- a/ipyparallel/client/client.py
+++ b/ipyparallel/client/client.py
@@ -567,8 +567,6 @@ def __init__(
             'apply_reply': self._handle_apply_reply,
         }
 
-        print(f'number of active threads running on client init {threading.active_count()}')
-
         try:
             self._connect(sshserver, ssh_kwargs, timeout)
         except:
diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py
index c4683488f..d8e52c741 100644
--- a/ipyparallel/client/view.py
+++ b/ipyparallel/client/view.py
@@ -850,7 +850,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe
         block = self.block if block is None else block
         track = self.track if track is None else track
         targets = self.targets if targets is None else targets
-
         idents, _targets = self.client._build_targets(targets)
         futures = []
@@ -861,7 +860,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None, targe
         s_idents = [ident.decode("utf8") for ident in idents]
 
         metadata = dict(targets=s_idents, is_broadcast=True, is_coalescing=self.is_coalescing)
-        print(f'Active threads on broadcastView: {threading.active_count()}')

From 107894eff6c48a92d0cdbc180e5b6e8bcc901562 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Tue, 26 May 2020 22:35:20 +0200
Subject: [PATCH 30/34] async only

---
 ipyparallel/controller/broadcast_scheduler.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py
index 47d335f4d..ea91dd052 100644
--- a/ipyparallel/controller/broadcast_scheduler.py
+++ b/ipyparallel/controller/broadcast_scheduler.py
@@ -16,16 +16,13 @@
 class BroadcastScheduler(Scheduler):
     port_name = 'broadcast'
     accumulated_replies = {}
-
-    depth = Integer(0)
     is_leaf = Bool(False)
     connected_sub_scheduler_ids = List(Bytes())
     outgoing_streams = List()
 
     def start(self):
         self.log.info(
-            'Broadcast Scheduler started with depth=%s, pid=%s',
-            self.depth,
+            'Broadcast Scheduler started with pid=%s',
             os.getpid(),
         )
         self.client_stream.on_recv(self.dispatch_submission, copy=False)

From 27fad6ce5e477f56b8ad133dcffe7ce99f3c1b63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Fri, 29 May 2020 12:04:15 +0200
Subject: [PATCH 31/34] delete messages after sending from scheduler to free
 up memory

---
 ipyparallel/controller/broadcast_scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py
index ea91dd052..3f3520c25 100644
--- a/ipyparallel/controller/broadcast_scheduler.py
+++ b/ipyparallel/controller/broadcast_scheduler.py
@@ -88,6 +88,7 @@ def coalescing_reply(self, raw_msg, msg, original_msg_id, outgoing_id):
                 ]
             )
             self.client_stream.send_multipart(new_msg, copy=False)
+            del self.accumulated_replies[original_msg_id]
         else:
             self.accumulated_replies[original_msg_id][outgoing_id] = msg['buffers']

From d539f45bfeb0d0bd58975590d16fdb1f6426209e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom-Olav=20B=C3=B8yum?=
Date: Fri, 28 Aug 2020 11:10:30 +0200
Subject: [PATCH 32/34] new scheduler implementation

---
 ipyparallel/client/view.py | 1 -
 ipyparallel/controller/broadcast_scheduler.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/ipyparallel/client/view.py b/ipyparallel/client/view.py
index d8e52c741..c5a18757f 100644
--- a/ipyparallel/client/view.py
+++ b/ipyparallel/client/view.py
@@ -1112,7 +1112,6 @@ def _really_apply(self, f, args=None, kwargs=None, block=None, track=None,
     @save_ids
     def map(self, f, *sequences, **kwargs):
         """``view.map(f, *sequences, block=self.block, chunksize=1, ordered=True)`` => list|AsyncMapResult
-
         Parallel version of builtin `map`, load-balanced by this View.
 
         `block`, and `chunksize` can be specified by keyword only.
diff --git a/ipyparallel/controller/broadcast_scheduler.py b/ipyparallel/controller/broadcast_scheduler.py
index 3f3520c25..0c73d9608 100644
--- a/ipyparallel/controller/broadcast_scheduler.py
+++ b/ipyparallel/controller/broadcast_scheduler.py
@@ -48,7 +48,6 @@ def send_to_sub_schedulers(
         self, msg, original_msg_id, targets, idents, is_coalescing
     ):
         if is_coalescing:
-
             self.accumulated_replies[original_msg_id] = {
                 scheduler_id: None for scheduler_id in self.connected_sub_scheduler_ids
             }

From 702e66d0035cd7478546601208b3bd7758fdd5b4 Mon Sep 17 00:00:00 2001
From: Min RK
Date: Mon, 22 Feb 2021 14:39:29 +0100
Subject: [PATCH 33/34] default broadcast depth=1

---
 ipyparallel/controller/hub.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ipyparallel/controller/hub.py b/ipyparallel/controller/hub.py
index 5d6c77e17..d8a45b1b2 100644
--- a/ipyparallel/controller/hub.py
+++ b/ipyparallel/controller/hub.py
@@ -154,7 +154,7 @@ def _task_default(self):
         return tuple(util.select_random_ports(2))
 
     broadcast_scheduler_depth = Integer(
-        3,
+        1,
         config=True,
         help="Depth of spanning tree schedulers",
     )
@@ -1626,4 +1626,3 @@ def stop_distributed(self, client_id, msg):
         self.session.send(self.query, "stop_distributed_reply", content=content,
                                             parent=msg, ident=client_id,
         )
-

From 866656e499c5595c4b525000e73293f8f3132c39 Mon Sep 17 00:00:00 2001
From: Min RK
Date: Tue, 23 Feb 2021 15:37:41 +0100
Subject: [PATCH 34/34] handle unparsed dates in AsyncResult

Parse on request in metadata, now that dates are no longer parsed when
messages are received.

---
 ipyparallel/client/asyncresult.py | 43 +++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/ipyparallel/client/asyncresult.py b/ipyparallel/client/asyncresult.py
index a59a866e3..369747a4e 100644
--- a/ipyparallel/client/asyncresult.py
+++ b/ipyparallel/client/asyncresult.py
@@ -23,7 +23,7 @@
 from IPython import get_ipython
 from IPython.core.display import clear_output, display, display_pretty
 from ipyparallel import error
-from ipyparallel.util import utcnow, compare_datetimes
+from ipyparallel.util import utcnow, compare_datetimes, _parse_date
 from ipython_genutils.py3compat import string_types
 from .futures import MessageFuture, multi_future
@@ -278,9 +278,28 @@ def r(self):
         """result property wrapper for `get(timeout=-1)`."""
         return self.get()
 
+    _DATE_FIELDS = [
+        "submitted",
+        "started",
+        "completed",
+        "received",
+    ]
+
+    def _parse_metadata_dates(self):
+        """Ensure metadata date fields are parsed on access
+
+        Rather than parsing timestamps from str->dt on receipt,
+        parse on access for compatibility.
+        """
+        for md in self._metadata:
+            for key in self._DATE_FIELDS:
+                if isinstance(md.get(key, None), str):
+                    md[key] = _parse_date(md[key])
+
     @property
     def metadata(self):
         """property for accessing execution metadata."""
+        self._parse_metadata_dates()
         if self._single_result:
             return self._metadata[0]
         else:
@@ -356,6 +375,7 @@ def __getitem__(self, key):
             # metadata proxy *does not* require that results are done
             self.wait(0)
             self.wait_for_output(0)
+            self._parse_metadata_dates()
             values = [ md[key] for md in self._metadata ]
             if self._single_result:
                 return values[0]
@@ -441,30 +461,30 @@ def timedelta(self, start, end, start_key=min, end_key=max):
             # not a list
             end = end_key(end)
         return compare_datetimes(end, start).total_seconds()
-    
+
     @property
     def progress(self):
         """the number of tasks which have been completed at this point.
-        
+
         Fractional progress would be given by 1.0 * ar.progress / len(ar)
         """
         self.wait(0)
         return len(self) - len(set(self.msg_ids).intersection(self._client.outstanding))
-    
+
     @property
     def elapsed(self):
         """elapsed time since initial submission"""
         if self.ready():
             return self.wall_time
-        
+
         now = submitted = utcnow()
-        for msg_id in self.msg_ids:
-            if msg_id in self._client.metadata:
-                stamp = self._client.metadata[msg_id]['submitted']
-                if stamp and stamp < submitted:
-                    submitted = stamp
+        self._parse_metadata_dates()
+        for md in self._metadata:
+            stamp = md["submitted"]
+            if stamp and stamp < submitted:
+                submitted = stamp
         return compare_datetimes(now, submitted).total_seconds()
-    
+
     @property
     @check_ready
     def serial_time(self):
@@ -473,6 +493,7 @@ def serial_time(self):
         Computed as the sum of (completed-started) of each task
         """
         t = 0
+        self._parse_metadata_dates()
         for md in self._metadata:
             t += compare_datetimes(md['completed'], md['started']).total_seconds()
         return t
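
Note: the parse-on-access pattern from PATCH 34 is easy to check in isolation. A minimal sketch of the same idea outside ipyparallel, where datetime.fromisoformat stands in for the library's internal _parse_date helper and the field names mirror AsyncResult._DATE_FIELDS:

    from datetime import datetime

    DATE_FIELDS = ["submitted", "started", "completed", "received"]

    def parse_metadata_dates(md):
        # Idempotent: strings are parsed once; already-parsed datetimes
        # and None values pass through, so calling this on every
        # property access is safe.
        for key in DATE_FIELDS:
            value = md.get(key)
            if isinstance(value, str):
                md[key] = datetime.fromisoformat(value)
        return md

    md = {"submitted": "2021-02-23T15:37:41+01:00", "started": None}
    parse_metadata_dates(md)
    assert isinstance(md["submitted"], datetime)
    assert md["started"] is None  # non-string fields are left untouched

The parsing cost thus moves from every received message to the first metadata access, which is what the commit message above means by "parse on request".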
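The client-side entry point the series converges on is Client.broadcast_view(targets, is_coalescing) from PATCH 25. A minimal usage sketch, assuming a running cluster with this branch installed; treat it as illustrative of the intended API rather than a stable interface at this point in the series:

    import ipyparallel as ipp

    rc = ipp.Client()  # connect to a running cluster

    # is_coalescing=True: the scheduler tree merges all replies into a
    # single message; False yields one future per engine instead.
    view = rc.broadcast_view(targets='all', is_coalescing=True)

    ar = view.apply_async(lambda: 42)
    print(ar.get(timeout=10))
    rc.close()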
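Since PATCH 24 the tree depth is an ordinary traitlets option on HubFactory (default 3 there, lowered to 1 in PATCH 33); a depth of d yields 2**d leaf schedulers and 2**(d+1) - 1 schedulers in total. Assuming standard traitlets configuration (the exact profile file name is an assumption here), it can be set like any other option:

    # ipcontroller_config.py
    c.HubFactory.broadcast_scheduler_depth = 2  # 4 leaves, 7 schedulers

    # or equivalently on the command line:
    #   ipcontroller --HubFactory.broadcast_scheduler_depth=2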