From abd8a45862fe7d3154df3a784fb61a6bf825c94e Mon Sep 17 00:00:00 2001 From: Armen Zambrano G Date: Wed, 1 Feb 2023 10:46:11 -0500 Subject: [PATCH 1/4] feat(derived_code_mappings): Prevent rate limit API exhaustion One of our customers is intermittently exhausting their rate limit, thus, affecting the commit_context tasks. Changes included: * get_repositories will not make API requests if the remaining requests is 200 or less. We will keep fetching repositories in the next hour when the limit is reset. * Spread caching of repositories across 24 hours. This will make the task only use 1/24th of the required API requests. Fixes (SENTRY-Y3M)[https://sentry.sentry.io/issues/3900033302/] --- tests/sentry/integrations/github/test_integration.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/sentry/integrations/github/test_integration.py b/tests/sentry/integrations/github/test_integration.py index f25fa519cf6fc3..e860df5a520261 100644 --- a/tests/sentry/integrations/github/test_integration.py +++ b/tests/sentry/integrations/github/test_integration.py @@ -611,15 +611,14 @@ def test_get_trees_for_org_works(self): installation = self.get_installation_helper() self.set_rate_limit(MINIMUM_REQUESTS + 50) expected_trees = { - "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []), - "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []), - "Test-Organization/foo": RepoTree( - Repo("Test-Organization/foo", "master"), - ["src/sentry/api/endpoints/auth_login.py"], - ), "Test-Organization/xyz": RepoTree( Repo("Test-Organization/xyz", "master"), ["src/foo.py"] ), + "Test-Organization/foo": RepoTree( + Repo("Test-Organization/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"] + ), + "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []), + "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []), } repos_key = "githubtrees:repositories:Test-Organization" From 1de2311ae2d8d6fa0f6c8cb1cc75e0652bcf9b38 Mon Sep 17 00:00:00 2001 From: Armen Zambrano G Date: Fri, 3 Feb 2023 12:38:08 -0500 Subject: [PATCH 2/4] ref(derived_code_mappings): Adjust handling of errors This fixes an issue where we would give up processing as soon as any Github API error would be returned (to the exception of some defined cases). This also moves handling of errors to the caller in order to have a centralized processing. It seems that [we raise unidentified API errors](https://github.com/getsentry/sentry/blob/d0252f70b97ceef909c86128566716255fab7eab/src/sentry/integrations/github/client.py#L124-L126) in get_tree. [get_cached_repo_files does not catch it](https://github.com/getsentry/sentry/blob/d0252f70b97ceef909c86128566716255fab7eab/src/sentry/integrations/github/client.py#L130-L161). [The iteration of the repositories happens within the try/block](https://github.com/getsentry/sentry/blob/d0252f70b97ceef909c86128566716255fab7eab/src/sentry/integrations/github/client.py#L170-L176), thus, failing to work as soon as one repo fails to process. --- src/sentry/integrations/github/client.py | 116 +++++++++--------- .../integrations/github/test_integration.py | 30 +++-- 2 files changed, 77 insertions(+), 69 deletions(-) diff --git a/src/sentry/integrations/github/client.py b/src/sentry/integrations/github/client.py index 979937709c58b3..43de32e6075176 100644 --- a/src/sentry/integrations/github/client.py +++ b/src/sentry/integrations/github/client.py @@ -97,38 +97,22 @@ def get_rate_limit(self, specific_resource: str = "core") -> GithubRateLimitInfo # https://docs.github.com/en/rest/git/trees#get-a-tree def get_tree(self, repo_full_name: str, tree_sha: str) -> JSONData: tree: JSONData = {} - try: - # We do not cache this call since it is a rather large object - contents: Dict[str, Any] = self.get( - f"/repos/{repo_full_name}/git/trees/{tree_sha}", - # Will cause all objects or subtrees referenced by the tree specified in :tree_sha - params={"recursive": 1}, + # We do not cache this call since it is a rather large object + contents: Dict[str, Any] = self.get( + f"/repos/{repo_full_name}/git/trees/{tree_sha}", + # Will cause all objects or subtrees referenced by the tree specified in :tree_sha + params={"recursive": 1}, + ) + # If truncated is true in the response then the number of items in the tree array exceeded our maximum limit. + # If you need to fetch more items, use the non-recursive method of fetching trees, and fetch one sub-tree at a time. + # Note: The limit for the tree array is 100,000 entries with a maximum size of 7 MB when using the recursive parameter. + # XXX: We will need to improve this by iterating through trees without using the recursive parameter + if contents.get("truncated"): + # e.g. getsentry/DataForThePeople + logger.warning( + f"The tree for {repo_full_name} has been truncated. Use different a approach for retrieving contents of tree." ) - # If truncated is true in the response then the number of items in the tree array exceeded our maximum limit. - # If you need to fetch more items, use the non-recursive method of fetching trees, and fetch one sub-tree at a time. - # Note: The limit for the tree array is 100,000 entries with a maximum size of 7 MB when using the recursive parameter. - # XXX: We will need to improve this by iterating through trees without using the recursive parameter - if contents.get("truncated"): - # e.g. getsentry/DataForThePeople - logger.warning( - f"The tree for {repo_full_name} has been truncated. Use different a approach for retrieving contents of tree." - ) - tree = contents["tree"] - except ApiError as error: - msg = error.text - if error.json: - json_data: JSONData = error.json - msg = json_data.get("message") - - # TODO: Add condition for getsentry/DataForThePeople - # e.g. getsentry/nextjs-sentry-example - if msg == "Git Repository is empty.": - logger.warning(f"{repo_full_name} is empty.") - elif msg == "Not Found": - logger.warning(f"The Github App does not have access to {repo_full_name}.") - else: - # Raise it so we stop hammering the API - raise error + tree = contents["tree"] return tree @@ -176,28 +160,12 @@ def get_trees_for_org(self, gh_org: str, cache_seconds: int = 3600 * 24) -> Dict """ trees: Dict[str, RepoTree] = {} extra = {"gh_org": gh_org} - try: - repositories = self._populate_repositories(gh_org, cache_seconds) - trees = self._populate_trees(repositories) + repositories = self._populate_repositories(gh_org, cache_seconds) + trees = self._populate_trees(repositories) - rate_limit = self.get_rate_limit() - extra.update( - {"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))} - ) - logger.info("Using cached trees for Github org.", extra=extra) - except ApiError as error: - msg = error.text - if error.json: - json_data: JSONData = error.json - msg = json_data.get("message") - if msg.startswith("API rate limit exceeded for installation"): - # Report to Sentry; we will return whatever tree we have - logger.exception( - "API rate limit exceeded. We will not hit it. Continuing execution.", - extra=extra, - ) - else: - raise error + rate_limit = self.get_rate_limit() + extra.update({"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))}) + logger.info("Using cached trees for Github org.", extra=extra) return trees @@ -221,6 +189,27 @@ def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoT For every repository, fetch the tree associated and cache it. This function takes API rate limits into consideration to prevent exhaustion. """ + + def process_error(error: ApiError, extra: Dict[str, str]) -> None: + msg = "Continuing exectuion." + txt = error.text + if error.json: + json_data: JSONData = error.json + txt = json_data.get("message") + + # TODO: Add condition for getsentry/DataForThePeople + # e.g. getsentry/nextjs-sentry-example + if txt == "Git Repository is empty.": + logger.warning(f"The repository is empty. {msg}", extra=extra) + elif txt == "Not Found": + logger.warning(f"The app does not have access to the repo. {msg}", extra=extra) + else: + # We do not raise the exception so we can keep iterating through the repos. + # Nevertheless, investigate the error to determine if we should abort the processing + logger.exception( + f"Investigate if to raise error. An error happened. {msg}", extra=extra + ) + trees: Dict[str, RepoTree] = {} only_use_cache = False @@ -229,19 +218,28 @@ def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoT logger.info("Current rate limit info.", extra={"rate_limit": rate_limit}) for index, repo_info in enumerate(repositories): + repo_full_name = repo_info["full_name"] + extra = {"repo_full_name": repo_full_name} # Only use the cache if we drop below the lower ceiling # We will fetch after the limit is reset (every hour) if not only_use_cache and remaining_requests <= MINIMUM_REQUESTS: only_use_cache = True logger.info( - "Too few requests remaining. Grabbing values from the cache.", - extra={"repo_info": repo_info}, + "Too few requests remaining. Grabbing values from the cache.", extra=extra ) - # The Github API rate limit is reset every hour - # Spread the expiration of the cache of each repo across the day - trees[repo_info["full_name"]] = self._populate_tree( - repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24)) - ) + + try: + # The Github API rate limit is reset every hour + # Spread the expiration of the cache of each repo across the day + trees[repo_full_name] = self._populate_tree( + repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24)) + ) + except ApiError as error: + process_error(error, extra) + except Exception as error: + # Report for investigatagiation but do not stop processing + logger.exception(error, extra=extra) + remaining_requests -= 1 return trees diff --git a/tests/sentry/integrations/github/test_integration.py b/tests/sentry/integrations/github/test_integration.py index e860df5a520261..3a68f62d063524 100644 --- a/tests/sentry/integrations/github/test_integration.py +++ b/tests/sentry/integrations/github/test_integration.py @@ -609,16 +609,15 @@ def get_installation_helper(self): def test_get_trees_for_org_works(self): """Fetch the tree representation of a repo""" installation = self.get_installation_helper() + cache.clear() self.set_rate_limit(MINIMUM_REQUESTS + 50) expected_trees = { - "Test-Organization/xyz": RepoTree( - Repo("Test-Organization/xyz", "master"), ["src/foo.py"] - ), "Test-Organization/foo": RepoTree( Repo("Test-Organization/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"] ), - "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []), - "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []), + "Test-Organization/xyz": RepoTree( + Repo("Test-Organization/xyz", "master"), ["src/foo.py"] + ), } repos_key = "githubtrees:repositories:Test-Organization" @@ -645,6 +644,8 @@ def test_get_trees_for_org_works(self): def test_get_trees_for_org_prevent_exhaustion_some_repos(self): """Some repos will hit the network but the rest will grab from the cache.""" gh_org = "Test-Organization" + repos_key = "githubtrees:repositories:Test-Organization" + cache.clear() installation = self.get_installation_helper() expected_trees = { f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]), @@ -656,8 +657,15 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self): with patch("sentry.integrations.github.client.MINIMUM_REQUESTS", new=5, autospec=False): # We start with one request left before reaching the minimum remaining requests floor self.set_rate_limit(remaining=6) + assert cache.get(repos_key) is None trees = installation.get_trees_for_org() assert trees == expected_trees # xyz will have files but not foo + assert cache.get(repos_key) == [ + {"full_name": "Test-Organization/xyz", "default_branch": "master"}, + {"full_name": "Test-Organization/foo", "default_branch": "master"}, + {"full_name": "Test-Organization/bar", "default_branch": "main"}, + {"full_name": "Test-Organization/baz", "default_branch": "master"}, + ] # Another call should not make us loose the files for xyz self.set_rate_limit(remaining=5) @@ -667,8 +675,10 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self): # We reset the remaining values self.set_rate_limit(remaining=20) trees = installation.get_trees_for_org() - # Now that the rate limit is reset we should get files for foo - expected_trees[f"{gh_org}/foo"] = RepoTree( - Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"] - ) - assert trees == expected_trees + assert trees == { + f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]), + # Now that the rate limit is reset we should get files for foo + f"{gh_org}/foo": RepoTree( + Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"] + ), + } From 28b4d56408ab0950ae2e970ae2f0c11bf4a74dad Mon Sep 17 00:00:00 2001 From: Armen Zambrano G Date: Mon, 6 Feb 2023 14:44:13 -0500 Subject: [PATCH 3/4] Minor fix --- src/sentry/integrations/github/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sentry/integrations/github/client.py b/src/sentry/integrations/github/client.py index 43de32e6075176..3f1ec6c44840b9 100644 --- a/src/sentry/integrations/github/client.py +++ b/src/sentry/integrations/github/client.py @@ -236,9 +236,11 @@ def process_error(error: ApiError, extra: Dict[str, str]) -> None: ) except ApiError as error: process_error(error, extra) - except Exception as error: + except Exception: # Report for investigatagiation but do not stop processing - logger.exception(error, extra=extra) + logger.exception( + "Failed to populate_tree. Investigate. Contining execution.", extra=extra + ) remaining_requests -= 1 From 02c71601870303efe62443972d14b1df7d264a67 Mon Sep 17 00:00:00 2001 From: Armen Zambrano G Date: Tue, 7 Feb 2023 09:10:23 -0500 Subject: [PATCH 4/4] Apply suggestions from code review --- src/sentry/integrations/github/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentry/integrations/github/client.py b/src/sentry/integrations/github/client.py index 3f1ec6c44840b9..bcdebb76518e94 100644 --- a/src/sentry/integrations/github/client.py +++ b/src/sentry/integrations/github/client.py @@ -191,7 +191,7 @@ def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoT """ def process_error(error: ApiError, extra: Dict[str, str]) -> None: - msg = "Continuing exectuion." + msg = "Continuing execution." txt = error.text if error.json: json_data: JSONData = error.json @@ -237,7 +237,7 @@ def process_error(error: ApiError, extra: Dict[str, str]) -> None: except ApiError as error: process_error(error, extra) except Exception: - # Report for investigatagiation but do not stop processing + # Report for investigation but do not stop processing logger.exception( "Failed to populate_tree. Investigate. Contining execution.", extra=extra )