feat(toggl): convert notebook to reusable module func

thekaveman · thekaveman · commit b393079913df · 2024-04-23T23:36:38.000-07:00
diff --git a/compiler_admin/services/files.py b/compiler_admin/services/files.py
@@ -0,0 +1,25 @@
+import json
+
+import pandas as pd
+
+
+def read_csv(file_path, **kwargs) -> pd.DataFrame:
+    """Load CSV data from a file path or buffer as a pandas.DataFrame.
+
+    Any extra keyword arguments are forwarded unchanged to pandas.read_csv.
+    """
+    frame = pd.read_csv(file_path, **kwargs)
+    return frame
+
+
+def read_json(file_path: str):
+    """Read a file path of JSON data into a python object.
+
+    Args:
+        file_path (str): Path of the JSON file to read.
+
+    Returns:
+        The deserialized JSON content (e.g. a dict or a list).
+    """
+    # decode explicitly as UTF-8 so the result does not depend on the platform's locale encoding
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def write_csv(file_path, data: pd.DataFrame, columns: list[str] = None):
+    """Serialize a pandas.DataFrame as CSV to the given path or writeable buffer.
+
+    When columns is given, only those columns are written. The index is never written.
+    """
+    options = {"columns": columns, "index": False}
+    data.to_csv(file_path, **options)
+
+
+def write_json(file_path: str, data):
+    """Write a python object as JSON to the given path.
+
+    Args:
+        file_path (str): Path of the file to create or overwrite.
+
+        data: A JSON-serializable python object.
+
+    Returns:
+        None. The JSON text is written with a 2-space indent.
+    """
+    # encode explicitly as UTF-8 so the file does not depend on the platform's locale encoding
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2)
diff --git a/compiler_admin/services/toggl.py b/compiler_admin/services/toggl.py
@@ -0,0 +1,132 @@
+import os
+import sys
+from typing import TextIO
+
+import pandas as pd
+
+from compiler_admin.services.google import user_info as google_user_info
+import compiler_admin.services.files as files
+
+# cache of previously seen project information, keyed on Toggl project name
+# (filled on first lookup from the JSON file named by the TOGGL_PROJECT_INFO env var)
+PROJECT_INFO = {}
+
+# cache of previously seen user information, keyed on email
+# (filled on first lookup from the JSON file named by the TOGGL_USER_INFO env var,
+# then extended as first/last names are derived or looked up)
+USER_INFO = {}
+# NOTE(review): not referenced by the code visible in this module; presumably a marker for failed lookups -- confirm
+NOT_FOUND = "NOT FOUND"
+
+# input CSV columns needed for conversion (passed as usecols when the Toggl CSV is read)
+INPUT_COLUMNS = ["Email", "Task", "Client", "Start date", "Start time", "Duration", "Description"]
+
+# default output CSV columns (the default for convert_to_harvest's output_cols)
+OUTPUT_COLUMNS = ["Date", "Client", "Project", "Task", "Notes", "Hours", "First Name", "Last Name"]
+
+
+def _harvest_client_name():
+    """Look up the HARVEST_CLIENT_NAME env var, yielding None when it is unset."""
+    client_name = os.getenv("HARVEST_CLIENT_NAME")
+    return client_name
+
+
+def _get_info(obj: dict, key: str, env_key: str):
+    """Look up key in obj, first filling obj from a JSON file when obj is still empty.
+
+    The JSON file path is taken from the env var named by env_key; when that env var
+    is unset or empty, obj is left untouched. Returns None when key is not in obj.
+    """
+    # only consult the environment while nothing has been cached yet
+    source_path = os.environ.get(env_key) if not obj else None
+    if source_path:
+        obj.update(files.read_json(source_path))
+    return obj.get(key)
+
+
+def _toggl_project_info(project: str):
+    """Look up project in the PROJECT_INFO cache.
+
+    While the cache is empty it is filled from the JSON file named by the
+    TOGGL_PROJECT_INFO env var. Returns None when there is no entry for project.
+    """
+    cached_project = _get_info(PROJECT_INFO, project, "TOGGL_PROJECT_INFO")
+    return cached_project
+
+
+def _toggl_user_info(email: str):
+    """Look up email in the USER_INFO cache.
+
+    While the cache is empty it is filled from the JSON file named by the
+    TOGGL_USER_INFO env var. Returns None when there is no entry for email.
+    """
+    cached_user = _get_info(USER_INFO, email, "TOGGL_USER_INFO")
+    return cached_user
+
+
+def _get_first_name(email: str) -> str:
+    """Return the first name for email, deriving and caching it when it is not already known.
+
+    A missing first name is derived by capitalizing the part of the email before the "@"
+    and is stored in USER_INFO under the "First Name" key.
+    """
+    cached = _toggl_user_info(email)
+    if cached:
+        known = cached.get("First Name")
+        if known is not None:
+            return known
+
+    # nothing cached: fall back to the local part of the address
+    local_part = email.split("@")[0]
+    derived = local_part.capitalize()
+    # merge into an existing entry, or start a new one for this email
+    USER_INFO.setdefault(email, {}).update({"First Name": derived})
+    return derived
+
+
+def _get_last_name(email: str) -> str | None:
+    """Get cached last name or query from Google.
+
+    Args:
+        email (str): The email address identifying the user.
+
+    Returns:
+        The value of the user's "Last Name" field, or None when neither the cache nor
+        the Google lookup provides one. Whatever the Google lookup returns is merged
+        into the USER_INFO entry for email.
+    """
+    user = _toggl_user_info(email)
+    last_name = user.get("Last Name") if user else None
+    if last_name is None:
+        # normalize an empty/None lookup result to a dict, so the cache update below
+        # can neither fail on None nor store None as the cache entry
+        user = google_user_info(email) or {}
+        last_name = user.get("Last Name")
+        # merge into an existing entry, or start a new one for this email
+        USER_INFO.setdefault(email, {}).update(user)
+    return last_name
+
+
+def _str_timedelta(td):
+    """Convert a string formatted duration (e.g. 01:30:00) to a timedelta.
+
+    The value is first parsed with the "%H:%M:%S" format, so it must be a clock-style
+    hours:minutes:seconds string.
+    NOTE(review): the %H directive only covers hours 00-23, so durations of 24 hours
+    or more are presumably rejected -- confirm against real Toggl exports.
+    """
+    clock_format = "%H:%M:%S"
+    parsed = pd.to_datetime(td, format=clock_format)
+    normalized = parsed.strftime(clock_format)
+    return pd.to_timedelta(normalized)
+
+
+def convert_to_harvest(
+    source_path: str | TextIO = sys.stdin,
+    output_path: str | TextIO = sys.stdout,
+    client_name: str = None,
+    output_cols: list[str] = OUTPUT_COLUMNS,
+):
+    """Convert Toggl formatted entries in source_path to equivalent Harvest formatted entries.
+
+    Args:
+        source_path: The path to a readable CSV file of Toggl time entries; or a readable buffer of the same.
+            None is treated as sys.stdin.
+
+        output_path: The path to a CSV file where Harvest time entries will be written; or a writeable buffer for the same.
+            None is treated as sys.stdout.
+
+        client_name (str): The value to assign in the output "Client" field.
+            When None, the value of the HARVEST_CLIENT_NAME env var is used.
+
+        output_cols (list[str]): A list of column names for the output
+
+    Returns:
+        None. Either prints the resulting CSV data or writes to output_path.
+    """
+    if client_name is None:
+        client_name = _harvest_client_name()
+
+    # an explicit None means "use the standard stream", mirroring the output_path handling below
+    if source_path is None:
+        source_path = sys.stdin
+
+    # read CSV file, parsing dates and times
+    source = files.read_csv(source_path, usecols=INPUT_COLUMNS, parse_dates=["Start date"], cache_dates=True)
+    source["Start time"] = source["Start time"].apply(_str_timedelta)
+    source["Duration"] = source["Duration"].apply(_str_timedelta)
+    # order entries chronologically, then by user
+    source.sort_values(["Start date", "Start time", "Email"], inplace=True)
+
+    # rename columns that can be imported as-is
+    source.rename(columns={"Task": "Project", "Description": "Notes", "Start date": "Date"}, inplace=True)
+
+    # update static calculated columns
+    source["Client"] = client_name
+    source["Task"] = "Project Consulting"
+
+    # get cached project name if any, otherwise keep the name from the source data
+    source["Project"] = source["Project"].apply(lambda x: _toggl_project_info(x) or x)
+
+    # assign First and Last Name
+    source["First Name"] = source["Email"].apply(_get_first_name)
+    source["Last Name"] = source["Email"].apply(_get_last_name)
+
+    # calculate hours as a decimal from duration timedelta
+    source["Hours"] = (source["Duration"].dt.total_seconds() / 3600).round(2)
+
+    if output_path is None:
+        output_path = sys.stdout
+
+    files.write_csv(output_path, source, columns=output_cols)
diff --git a/tests/services/test_toggl.py b/tests/services/test_toggl.py
@@ -0,0 +1,72 @@
+from io import StringIO
+import sys
+
+import pandas as pd
+import pytest
+
+from compiler_admin.services.toggl import INPUT_COLUMNS, files, OUTPUT_COLUMNS, convert_to_harvest, __name__ as MODULE
+
+
+@pytest.fixture(autouse=True)
+def mock_environment(monkeypatch):
+    """Set the env vars read by the toggl module; applied automatically to every test."""
+    env = {
+        "HARVEST_CLIENT_NAME": "Test_Client",
+        "TOGGL_PROJECT_INFO": "notebooks/data/toggl-project-info-sample.json",
+        "TOGGL_USER_INFO": "notebooks/data/toggl-user-info-sample.json",
+    }
+    for name, value in env.items():
+        monkeypatch.setenv(name, value)
+
+
+@pytest.fixture
+def mock_files(mocker):
+    """Patch the files helper referenced by the toggl module with a mock spec'd on the real module."""
+    target = f"{MODULE}.files"
+    return mocker.patch(target, spec=files)
+
+
+@pytest.fixture
+def mock_google_user_info(mocker):
+    """Patch the google_user_info function referenced by the toggl module."""
+    target = f"{MODULE}.google_user_info"
+    return mocker.patch(target)
+
+
+@pytest.fixture
+def source_data():
+    """Path of the sample Toggl CSV used as conversion input."""
+    toggl_sample_path = "notebooks/data/toggl-sample.csv"
+    return toggl_sample_path
+
+
+@pytest.fixture
+def sample_transformed_data():
+    """Path of the sample Harvest CSV that the conversion output is compared against."""
+    harvest_sample_path = "notebooks/data/harvest-sample.csv"
+    return harvest_sample_path
+
+
+def test_convert_to_harvest_mocked(source_data, mock_files, mock_google_user_info):
+    """With file I/O mocked, the source is read and the result written once each, with the expected arguments."""
+    mock_google_user_info.return_value = {}
+
+    convert_to_harvest(source_data)
+
+    # reading: the source path is passed positionally, parsing options by keyword
+    mock_files.read_csv.assert_called_once()
+    read_call = mock_files.read_csv.call_args
+    read_kwargs = read_call.kwargs
+    assert (source_data,) in read_call
+    assert read_kwargs["usecols"] == INPUT_COLUMNS
+    assert read_kwargs["parse_dates"] == ["Start date"]
+    assert read_kwargs["cache_dates"] is True
+
+    # writing: defaults to stdout and receives the very object that was read
+    mock_files.write_csv.assert_called_once()
+    write_call = mock_files.write_csv.call_args
+    expected_write_args = (sys.stdout, mock_files.read_csv.return_value)
+    assert expected_write_args in write_call
+    assert write_call.kwargs["columns"] == OUTPUT_COLUMNS
+
+
+def test_convert_to_harvest_sample(source_data, sample_transformed_data, mock_google_user_info):
+    """Converting the sample Toggl file yields CSV text with the expected header, columns and client."""
+    mock_google_user_info.return_value = {}
+
+    # capture the converted CSV in memory instead of writing a file
+    with StringIO() as buffer:
+        convert_to_harvest(source_data, buffer)
+        output = buffer.getvalue()
+
+    assert output
+    assert isinstance(output, str)
+    header = ",".join(OUTPUT_COLUMNS)
+    assert header in output
+
+    sort_keys = ["Date", "First Name", "Hours"]
+    expected_df = pd.read_csv(sample_transformed_data).sort_values(sort_keys)
+    actual_df = pd.read_csv(StringIO(output)).sort_values(sort_keys)
+
+    # the converted columns must be a strict subset of the sample's columns
+    actual_columns = set(actual_df.columns.to_list())
+    expected_columns = set(expected_df.columns.to_list())
+    assert actual_columns < expected_columns
+    assert actual_df["Client"].eq("Test_Client").all()