Skip to content

Commit 4a5634d

Browse files
authored
Merge branch 'master' into fix-lightningapp-e2e
2 parents 8c44295 + e6a8283 commit 4a5634d

File tree

35 files changed

+380
-163
lines changed

35 files changed

+380
-163
lines changed

requirements/app/test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ isort>=5.0
1212
mypy>=0.720
1313
httpx
1414
trio
15+
pympler

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ exclude =
6767
*.egg
6868
build
6969
temp
70+
_notebooks
7071

7172
select = E,W,F
7273
doctests = True

src/lightning_app/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
2424
### Deprecated
2525

2626
### Fixed
27+
28+
- Resolved a bug where the work statuses would grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970))

src/lightning_app/core/app.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef
1919
from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests
2020
from lightning_app.utilities.component import _convert_paths_after_init
21-
from lightning_app.utilities.enum import AppStage
21+
from lightning_app.utilities.enum import AppStage, CacheCallsKeys
2222
from lightning_app.utilities.exceptions import CacheMissException, ExitAppException
2323
from lightning_app.utilities.layout import _collect_layout
2424
from lightning_app.utilities.proxies import ComponentDelta
@@ -399,8 +399,8 @@ def _run(self) -> bool:
399399
if self.should_publish_changes_to_api and self.api_publish_state_queue:
400400
logger.debug("Publishing the state with changes")
401401
# Push two states to optimize start in the cloud.
402-
self.api_publish_state_queue.put(self.state)
403-
self.api_publish_state_queue.put(self.state)
402+
self.api_publish_state_queue.put(self.state_vars)
403+
self.api_publish_state_queue.put(self.state_vars)
404404

405405
self._reset_run_time_monitor()
406406

@@ -412,7 +412,7 @@ def _run(self) -> bool:
412412
self._update_run_time_monitor()
413413

414414
if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue:
415-
self.api_publish_state_queue.put(self.state)
415+
self.api_publish_state_queue.put(self.state_vars)
416416

417417
return True
418418

@@ -430,16 +430,12 @@ def _apply_restarting(self) -> bool:
430430
self.stage = AppStage.BLOCKING
431431
return False
432432

433-
def _collect_work_finish_status(self) -> dict:
434-
work_finished_status = {}
435-
for work in self.works:
436-
work_finished_status[work.name] = False
437-
for key in work._calls:
438-
if key == "latest_call_hash":
439-
continue
440-
fn_metadata = work._calls[key]
441-
work_finished_status[work.name] = fn_metadata["name"] == "run" and "ret" in fn_metadata
433+
def _has_work_finished(self, work):
434+
latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH]
435+
return "ret" in work._calls[latest_call_hash]
442436

437+
def _collect_work_finish_status(self) -> dict:
438+
work_finished_status = {work.name: self._has_work_finished(work) for work in self.works}
443439
assert len(work_finished_status) == len(self.works)
444440
return work_finished_status
445441

src/lightning_app/core/work.py

Lines changed: 73 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,15 @@
1212
from lightning_app.storage.drive import _maybe_create_drive, Drive
1313
from lightning_app.storage.payload import Payload
1414
from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef
15-
from lightning_app.utilities.component import _sanitize_state
16-
from lightning_app.utilities.enum import make_status, WorkFailureReasons, WorkStageStatus, WorkStatus, WorkStopReasons
15+
from lightning_app.utilities.component import _is_flow_context, _sanitize_state
16+
from lightning_app.utilities.enum import (
17+
CacheCallsKeys,
18+
make_status,
19+
WorkFailureReasons,
20+
WorkStageStatus,
21+
WorkStatus,
22+
WorkStopReasons,
23+
)
1724
from lightning_app.utilities.exceptions import LightningWorkException
1825
from lightning_app.utilities.introspection import _is_init_context
1926
from lightning_app.utilities.network import find_free_network_port
@@ -107,7 +114,21 @@ def __init__(
107114
# setattr_replacement is used by the multiprocessing runtime to send the latest changes to the main coordinator
108115
self._setattr_replacement: Optional[Callable[[str, Any], None]] = None
109116
self._name = ""
110-
self._calls = {"latest_call_hash": None}
117+
# The ``self._calls`` is used to track whether the run
118+
# method with a given set of input arguments has already been called.
119+
# Example of its usage:
120+
# {
121+
# 'latest_call_hash': '167fe2e',
122+
# '167fe2e': {
123+
# 'statuses': [
124+
# {'stage': 'pending', 'timestamp': 1659433519.851271},
125+
# {'stage': 'running', 'timestamp': 1659433519.956482},
126+
# {'stage': 'stopped', 'timestamp': 1659433520.055768}
127+
# ]
128+
# },
129+
# ...
130+
# }
131+
self._calls = {CacheCallsKeys.LATEST_CALL_HASH: None}
111132
self._changes = {}
112133
self._raise_exception = raise_exception
113134
self._paths = {}
@@ -215,22 +236,22 @@ def status(self) -> WorkStatus:
215236
216237
All statuses are stored in the state.
217238
"""
218-
call_hash = self._calls["latest_call_hash"]
219-
if call_hash:
239+
call_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH]
240+
if call_hash in self._calls:
220241
statuses = self._calls[call_hash]["statuses"]
221242
# deltas aren't necessarily coming in the expected order.
222243
statuses = sorted(statuses, key=lambda x: x["timestamp"])
223244
latest_status = statuses[-1]
224-
if latest_status["reason"] == WorkFailureReasons.TIMEOUT:
245+
if latest_status.get("reason") == WorkFailureReasons.TIMEOUT:
225246
return self._aggregate_status_timeout(statuses)
226247
return WorkStatus(**latest_status)
227248
return WorkStatus(stage=WorkStageStatus.NOT_STARTED, timestamp=time.time())
228249

229250
@property
230251
def statuses(self) -> List[WorkStatus]:
231252
"""Return all the status of the work."""
232-
call_hash = self._calls["latest_call_hash"]
233-
if call_hash:
253+
call_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH]
254+
if call_hash in self._calls:
234255
statuses = self._calls[call_hash]["statuses"]
235256
# deltas aren't necessarily coming in the expected order.
236257
statuses = sorted(statuses, key=lambda x: x["timestamp"])
@@ -398,10 +419,13 @@ def __getattr__(self, item):
398419
return path
399420
return self.__getattribute__(item)
400421

401-
def _call_hash(self, fn, args, kwargs):
422+
def _call_hash(self, fn, args, kwargs) -> str:
402423
hash_args = args[1:] if len(args) > 0 and args[0] == self else args
403424
call_obj = {"args": hash_args, "kwargs": kwargs}
404-
return f"{fn.__name__}:{DeepHash(call_obj)[call_obj]}"
425+
# Note: Generate a short hash such as 167fe2e.
426+
# Seven characters were chosen to match GitHub's default abbreviated SHA length
427+
# and to minimize hidden state size.
428+
return str(DeepHash(call_obj)[call_obj])[:7]
405429

406430
def _wrap_run_for_caching(self, fn):
407431
@wraps(fn)
@@ -415,11 +439,11 @@ def new_fn(*args, **kwargs):
415439
entry = self._calls[call_hash]
416440
return entry["ret"]
417441

418-
self._calls[call_hash] = {"name": fn.__name__, "call_hash": call_hash}
442+
self._calls[call_hash] = {}
419443

420444
result = fn(*args, **kwargs)
421445

422-
self._calls[call_hash] = {"name": fn.__name__, "call_hash": call_hash, "ret": result}
446+
self._calls[call_hash] = {"ret": result}
423447

424448
return result
425449

@@ -457,8 +481,40 @@ def set_state(self, provided_state):
457481
if isinstance(v, Dict):
458482
v = _maybe_create_drive(self.name, v)
459483
setattr(self, k, v)
484+
460485
self._changes = provided_state["changes"]
461-
self._calls.update(provided_state["calls"])
486+
487+
# Note, this is handled by the flow only.
488+
if _is_flow_context():
489+
self._cleanup_calls(provided_state["calls"])
490+
491+
self._calls = provided_state["calls"]
492+
493+
@staticmethod
494+
def _cleanup_calls(calls: Dict[str, Any]):
495+
# 1: Collect all the in_progress call hashes
496+
in_progress_call_hash = [k for k in list(calls) if k not in (CacheCallsKeys.LATEST_CALL_HASH)]
497+
498+
for call_hash in in_progress_call_hash:
499+
if "statuses" not in calls[call_hash]:
500+
continue
501+
502+
# 2: Sort the statuses by timestamp
503+
statuses = sorted(calls[call_hash]["statuses"], key=lambda x: x["timestamp"])
504+
505+
# If the latest status is succeeded, then drop everything before.
506+
if statuses[-1]["stage"] == WorkStageStatus.SUCCEEDED:
507+
status = statuses[-1]
508+
status["timestamp"] = int(status["timestamp"])
509+
calls[call_hash]["statuses"] = [status]
510+
else:
511+
# TODO: Some statuses are being duplicated,
512+
# this seems related to the StateObserver.
513+
final_statuses = []
514+
for status in statuses:
515+
if status not in final_statuses:
516+
final_statuses.append(status)
517+
calls[call_hash]["statuses"] = final_statuses
462518

463519
@abc.abstractmethod
464520
def run(self, *args, **kwargs):
@@ -479,7 +535,7 @@ def _aggregate_status_timeout(self, statuses: List[Dict]) -> WorkStatus:
479535
if succeeded_statuses:
480536
succeed_status_id = succeeded_statuses[-1] + 1
481537
statuses = statuses[succeed_status_id:]
482-
timeout_statuses = [status for status in statuses if status["reason"] == WorkFailureReasons.TIMEOUT]
538+
timeout_statuses = [status for status in statuses if status.get("reason") == WorkFailureReasons.TIMEOUT]
483539
assert statuses[0]["stage"] == WorkStageStatus.PENDING
484540
status = {**timeout_statuses[-1], "timestamp": statuses[0]["timestamp"]}
485541
return WorkStatus(**status, count=len(timeout_statuses))
@@ -501,9 +557,8 @@ def stop(self):
501557
)
502558
if self.status.stage == WorkStageStatus.STOPPED:
503559
return
504-
latest_hash = self._calls["latest_call_hash"]
505-
self._calls[latest_hash]["statuses"].append(
506-
make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.PENDING)
507-
)
560+
latest_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH]
561+
stop_status = make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.PENDING)
562+
self._calls[latest_hash]["statuses"].append(stop_status)
508563
app = _LightningAppRef().get_current()
509564
self._backend.stop_work(app, self)

src/lightning_app/runners/runtime.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from lightning_app import LightningApp
1111
from lightning_app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT
1212
from lightning_app.runners.backends import Backend, BackendType
13-
from lightning_app.utilities.enum import AppStage, make_status, WorkStageStatus
13+
from lightning_app.utilities.enum import AppStage, CacheCallsKeys, make_status, WorkStageStatus
1414
from lightning_app.utilities.load_app import load_app_from_file
1515
from lightning_app.utilities.proxies import WorkRunner
1616

@@ -133,9 +133,10 @@ def dispatch(self, *args, **kwargs):
133133
raise NotImplementedError
134134

135135
def _add_stopped_status_to_work(self, work: "lightning_app.LightningWork") -> None:
136+
136137
if work.status.stage == WorkStageStatus.STOPPED:
137138
return
138-
latest_hash = work._calls["latest_call_hash"]
139-
if latest_hash is None:
140-
return
141-
work._calls[latest_hash]["statuses"].append(make_status(WorkStageStatus.STOPPED))
139+
140+
latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH]
141+
if latest_call_hash in work._calls:
142+
work._calls[latest_call_hash]["statuses"].append(make_status(WorkStageStatus.STOPPED))

src/lightning_app/testing/testing.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from lightning_app.runners.multiprocess import MultiProcessRuntime
2323
from lightning_app.testing.config import Config
2424
from lightning_app.utilities.cloud import _get_project
25+
from lightning_app.utilities.enum import CacheCallsKeys
2526
from lightning_app.utilities.imports import _is_playwright_available, requires
2627
from lightning_app.utilities.network import _configure_session, LightningClient
2728
from lightning_app.utilities.proxies import ProxyWorkRun
@@ -114,8 +115,11 @@ def run_work_isolated(work, *args, start_server: bool = False, **kwargs):
114115
start_server=start_server,
115116
).dispatch()
116117
# pop the stopped status.
117-
call_hash = work._calls["latest_call_hash"]
118-
work._calls[call_hash]["statuses"].pop(-1)
118+
call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH]
119+
120+
if call_hash in work._calls:
121+
work._calls[call_hash]["statuses"].pop(-1)
122+
119123
if isinstance(work.run, ProxyWorkRun):
120124
work.run = work.run.work_run
121125

@@ -176,7 +180,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator:
176180
# 3. Launch the application in the cloud from the Lightning CLI.
177181
with tempfile.TemporaryDirectory() as tmpdir:
178182
env_copy = os.environ.copy()
179-
env_copy["PREPARE_LIGHTING"] = "1"
183+
env_copy["PACKAGE_LIGHTNING"] = "1"
180184
shutil.copytree(app_folder, tmpdir, dirs_exist_ok=True)
181185
# TODO - add -no-cache to the command line.
182186
process = Popen(
@@ -216,7 +220,10 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator:
216220
record_har_path=Config.har_location,
217221
)
218222
admin_page = context.new_page()
219-
res = requests.post(Config.url + "/v1/auth/login", data=json.dumps(payload))
223+
url = Config.url
224+
if url.endswith("/"):
225+
url = url[:-1]
226+
res = requests.post(url + "/v1/auth/login", data=json.dumps(payload))
220227
token = res.json()["token"]
221228
print(f"The Lightning App Token is: {token}")
222229
print(f"The Lightning App user key is: {Config.key}")

src/lightning_app/utilities/enum.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,16 @@ def __post_init__(self):
5959

6060

6161
def make_status(stage: str, message: Optional[str] = None, reason: Optional[str] = None):
62-
return {
62+
status = {
6363
"stage": stage,
64-
"message": message,
65-
"reason": reason,
6664
"timestamp": datetime.now(tz=timezone.utc).timestamp(),
6765
}
66+
if message:
67+
status["message"] = message
68+
if reason:
69+
status["reason"] = reason
70+
return status
71+
72+
73+
class CacheCallsKeys:
74+
LATEST_CALL_HASH = "latest_call_hash"

src/lightning_app/utilities/network.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def _configure_session() -> Session:
4848
return http
4949

5050

51-
def _check_service_url_is_ready(url: str, timeout: float = 1) -> bool:
51+
def _check_service_url_is_ready(url: str, timeout: float = 100) -> bool:
5252
try:
5353
response = requests.get(url, timeout=timeout)
5454
return response.status_code in (200, 404)

src/lightning_app/utilities/packaging/build_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def _find_requirements(self, work: "LightningWork") -> List[str]:
110110
file = inspect.getfile(work.__class__)
111111

112112
# 2. Try to find a requirements file associated with the file.
113-
dirname = os.path.dirname(file)
113+
dirname = os.path.dirname(file) or "."
114114
requirement_files = [os.path.join(dirname, f) for f in os.listdir(dirname) if f == "requirements.txt"]
115115
if not requirement_files:
116116
return []
@@ -126,7 +126,7 @@ def _find_dockerfile(self, work: "LightningWork") -> List[str]:
126126
file = inspect.getfile(work.__class__)
127127

128128
# 2. Check for Dockerfile.
129-
dirname = os.path.dirname(file)
129+
dirname = os.path.dirname(file) or "."
130130
dockerfiles = [os.path.join(dirname, f) for f in os.listdir(dirname) if f == "Dockerfile"]
131131

132132
if not dockerfiles:

0 commit comments

Comments
 (0)