From 941b11d8ad6c0670d169de919edd0daf65ee8a60 Mon Sep 17 00:00:00 2001
From: Kathryn M Mazaitis
Date: Sun, 24 Oct 2021 10:21:17 -0400
Subject: [PATCH] Don't open a database connection until you've downloaded the
 files

---
 src/acquisition/covid_hosp/common/utils.py | 39 +++++++++++++---------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/acquisition/covid_hosp/common/utils.py b/src/acquisition/covid_hosp/common/utils.py
index b6e7b216d..91f2b8f42 100644
--- a/src/acquisition/covid_hosp/common/utils.py
+++ b/src/acquisition/covid_hosp/common/utils.py
@@ -148,25 +148,34 @@ def update_dataset(database, network, newer_than=None, older_than=None):
       Whether a new dataset was acquired.
     """
     metadata = network.fetch_metadata()
+    datasets = []
     with database.connect() as db:
       max_issue = db.get_max_issue()
-      older_than = datetime.datetime.today().date() if newer_than is None else older_than
-      newer_than = max_issue if newer_than is None else newer_than
-      daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than)
-      if not daily_issues:
-        print("no new issues, nothing to do")
-        return False
-      for issue, revisions in daily_issues.items():
-        issue_int = int(issue.strftime("%Y%m%d"))
-        # download the dataset and add it to the database
-        dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
-                                          db.KEY_COLS)
+
+    older_than = datetime.datetime.today().date() if newer_than is None else older_than
+    newer_than = max_issue if newer_than is None else newer_than
+    daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than)
+    if not daily_issues:
+      print("no new issues, nothing to do")
+      return False
+    for issue, revisions in daily_issues.items():
+      issue_int = int(issue.strftime("%Y%m%d"))
+      # download the dataset and add it to the database
+      dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions],
+                                        db.KEY_COLS)
+      # add metadata to the database using the last revision seen.
+      last_url, last_index = revisions[-1]
+      metadata_json = metadata.loc[last_index].reset_index().to_json()
+      datasets.append((
+        issue_int,
+        dataset,
+        last_url,
+        metadata_json
+      ))
+    with database.connect() as db:
+      for issue_int, dataset, last_url, metadata_json in datasets:
         db.insert_dataset(issue_int, dataset)
-        # add metadata to the database using the last revision seen.
-        last_url, last_index = revisions[-1]
-        metadata_json = metadata.loc[last_index].reset_index().to_json()
         db.insert_metadata(issue_int, last_url, metadata_json)
-      print(f'successfully acquired {len(dataset)} rows')
     # note that the transaction is committed by exiting the `with` block
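
A minimal, self-contained sketch of the pattern this patch adopts: read the
high-water mark over a brief connection, run the slow downloads while no
connection is held open, then open a second short-lived connection for all
the writes. FakeDatabase, FakeDb, and slow_download below are illustrative
stubs for this sketch only, not the repo's actual Database or Network classes:

import contextlib

class FakeDb:
  def get_max_issue(self):
    return 20211001

  def insert_dataset(self, issue, rows):
    print(f"insert issue {issue}: {len(rows)} rows")

class FakeDatabase:
  @contextlib.contextmanager
  def connect(self):
    # stands in for database.connect(); exiting the block commits
    print("connection opened")
    yield FakeDb()
    print("connection closed (transaction committed)")

def slow_download(url):
  # stands in for network.fetch_dataset(url); may take minutes in real life
  return [{"source": url}]

def update(database, urls):
  # phase 1: a brief connection, just to read the high-water mark
  with database.connect() as db:
    max_issue = db.get_max_issue()
  # phase 2: all downloads happen with no connection held open, so a slow
  # fetch can no longer tie up (or time out) a database connection
  datasets = [(max_issue + i + 1, slow_download(u)) for i, u in enumerate(urls)]
  # phase 3: a second short-lived connection performs every insert; the
  # transaction is committed by exiting the `with` block
  with database.connect() as db:
    for issue, rows in datasets:
      db.insert_dataset(issue, rows)

update(FakeDatabase(), ["https://example.com/a.csv", "https://example.com/b.csv"])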