src/sentry/integrations/github/client.py (118 changes: 59 additions, 59 deletions)
@@ -97,38 +97,22 @@ def get_rate_limit(self, specific_resource: str = "core") -> GithubRateLimitInfo
     # https://docs.github.com/en/rest/git/trees#get-a-tree
     def get_tree(self, repo_full_name: str, tree_sha: str) -> JSONData:
         tree: JSONData = {}
-        try:
-            # We do not cache this call since it is a rather large object
-            contents: Dict[str, Any] = self.get(
-                f"/repos/{repo_full_name}/git/trees/{tree_sha}",
-                # Will cause all objects or subtrees referenced by the tree specified in :tree_sha
-                params={"recursive": 1},
-            )
-            # If truncated is true in the response then the number of items in the tree array exceeded our maximum limit.
-            # If you need to fetch more items, use the non-recursive method of fetching trees, and fetch one sub-tree at a time.
-            # Note: The limit for the tree array is 100,000 entries with a maximum size of 7 MB when using the recursive parameter.
-            # XXX: We will need to improve this by iterating through trees without using the recursive parameter
-            if contents.get("truncated"):
-                # e.g. getsentry/DataForThePeople
-                logger.warning(
-                    f"The tree for {repo_full_name} has been truncated. Use a different approach for retrieving contents of tree."
-                )
-            tree = contents["tree"]
-        except ApiError as error:
-            msg = error.text
-            if error.json:
-                json_data: JSONData = error.json
-                msg = json_data.get("message")
-
-            # TODO: Add condition for getsentry/DataForThePeople
-            # e.g. getsentry/nextjs-sentry-example
-            if msg == "Git Repository is empty.":
-                logger.warning(f"{repo_full_name} is empty.")
-            elif msg == "Not Found":
-                logger.warning(f"The Github App does not have access to {repo_full_name}.")
-            else:
-                # Raise it so we stop hammering the API
-                raise error
+        # We do not cache this call since it is a rather large object
+        contents: Dict[str, Any] = self.get(
+            f"/repos/{repo_full_name}/git/trees/{tree_sha}",
+            # Will cause all objects or subtrees referenced by the tree specified in :tree_sha
+            params={"recursive": 1},
+        )
+        # If truncated is true in the response then the number of items in the tree array exceeded our maximum limit.
+        # If you need to fetch more items, use the non-recursive method of fetching trees, and fetch one sub-tree at a time.
+        # Note: The limit for the tree array is 100,000 entries with a maximum size of 7 MB when using the recursive parameter.
+        # XXX: We will need to improve this by iterating through trees without using the recursive parameter
+        if contents.get("truncated"):
+            # e.g. getsentry/DataForThePeople
+            logger.warning(
+                f"The tree for {repo_full_name} has been truncated. Use a different approach for retrieving contents of tree."
+            )
+        tree = contents["tree"]
 
         return tree

Review comment (armenzg, PR author), on the removed try line: If you use "Hide whitespace" this PR is easier to review. (screenshot omitted)

Review comment (armenzg, Feb 3, 2023), on the removed except block: Removing this except block and letting _populate_trees handle it:

    try:
        # The Github API rate limit is reset every hour
        # Spread the expiration of the cache of each repo across the day
        trees[repo_full_name] = self._populate_tree(
            repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24))
        )
    except ApiError as error:
        process_error(error, extra)
    except Exception:
        # Report for investigation but do not stop processing
        logger.exception(
            "Failed to populate_tree. Investigate. Continuing execution.", extra=extra
        )

Review comment (armenzg), on the removed raise error: Here we raise any unidentified error, which would cause the processing of all repos to abort.
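
The XXX comment above notes that truncated trees will eventually need to be fetched without the recursive parameter. Purely as an illustration (nothing below is part of this PR, and fetch_tree is a hypothetical wrapper around the client's get call), a breadth-first walk that requests one sub-tree at a time could look like this:

from collections import deque
from typing import Any, Callable, Dict, List


def walk_tree_non_recursive(
    fetch_tree: Callable[[str, str], Dict[str, Any]],
    repo_full_name: str,
    root_sha: str,
) -> List[str]:
    """Collect every blob path by fetching one sub-tree per request."""
    paths: List[str] = []
    # Each queue entry is (tree sha, path prefix of that tree)
    queue: deque = deque([(root_sha, "")])
    while queue:
        sha, prefix = queue.popleft()
        # Without recursive=1 a single level is returned, so it is never truncated
        contents = fetch_tree(repo_full_name, sha)
        for entry in contents.get("tree", []):
            full_path = f"{prefix}{entry['path']}"
            if entry["type"] == "tree":
                queue.append((entry["sha"], f"{full_path}/"))
            else:
                paths.append(full_path)
    return paths

Each request returns a single directory level, so the 100,000-entry / 7 MB limits of the recursive endpoint no longer apply; the trade-off is one API call per directory, which makes rate-limit awareness matter even more on that path.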

@@ -176,28 +160,12 @@ def get_trees_for_org(self, gh_org: str, cache_seconds: int = 3600 * 24) -> Dict
         """
         trees: Dict[str, RepoTree] = {}
         extra = {"gh_org": gh_org}
-        try:
-            repositories = self._populate_repositories(gh_org, cache_seconds)
-            trees = self._populate_trees(repositories)
+        repositories = self._populate_repositories(gh_org, cache_seconds)
+        trees = self._populate_trees(repositories)
 
-            rate_limit = self.get_rate_limit()
-            extra.update(
-                {"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))}
-            )
-            logger.info("Using cached trees for Github org.", extra=extra)
-        except ApiError as error:
-            msg = error.text
-            if error.json:
-                json_data: JSONData = error.json
-                msg = json_data.get("message")
-            if msg.startswith("API rate limit exceeded for installation"):
-                # Report to Sentry; we will return whatever tree we have
-                logger.exception(
-                    "API rate limit exceeded. We will not hit it. Continuing execution.",
-                    extra=extra,
-                )
-            else:
-                raise error
+        rate_limit = self.get_rate_limit()
+        extra.update({"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))})
+        logger.info("Using cached trees for Github org.", extra=extra)
 
         return trees
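
With the surrounding try/except gone, get_trees_for_org no longer detects the installation rate-limit error itself; _populate_trees is now responsible for not exhausting the limit. If a caller still wanted the old guard, a minimal sketch based on the removed block (assuming only that the error object exposes .text and .json the way ApiError does above; the helper name is invented) could be:

from typing import Any, Optional


def is_installation_rate_limit(error: Any) -> bool:
    """Best-effort check for GitHub's 'API rate limit exceeded for installation' message."""
    msg: Optional[str] = getattr(error, "text", None)
    json_data = getattr(error, "json", None)
    if json_data:
        # The decoded GitHub payload carries the human-readable reason in "message",
        # mirroring how the removed except block extracted it.
        msg = json_data.get("message")
    return bool(msg) and msg.startswith("API rate limit exceeded for installation")

A caller could catch ApiError, log and keep whatever trees it already has when this returns True, and re-raise otherwise, which is what the removed block did.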

@@ -221,6 +189,27 @@ def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoT
         For every repository, fetch the tree associated and cache it.
         This function takes API rate limits into consideration to prevent exhaustion.
         """
+
+        def process_error(error: ApiError, extra: Dict[str, str]) -> None:
+            msg = "Continuing execution."
+            txt = error.text
+            if error.json:
+                json_data: JSONData = error.json
+                txt = json_data.get("message")
+
+            # TODO: Add condition for getsentry/DataForThePeople
+            # e.g. getsentry/nextjs-sentry-example
+            if txt == "Git Repository is empty.":
+                logger.warning(f"The repository is empty. {msg}", extra=extra)
+            elif txt == "Not Found":
+                logger.warning(f"The app does not have access to the repo. {msg}", extra=extra)
+            else:
+                # We do not raise the exception so we can keep iterating through the repos.
+                # Nevertheless, investigate the error to determine if we should abort the processing
+                logger.exception(
+                    f"Investigate if to raise error. An error happened. {msg}", extra=extra
+                )
+
         trees: Dict[str, RepoTree] = {}
         only_use_cache = False

Review comment (armenzg), on the added process_error: This block basically comes from get_tree. We are now handling it here so we can keep moving forward with processing repositories rather than aborting.

Review comment (armenzg), on the added logger.exception branch: Here we only report the error, unlike in get_tree where we would raise it and make the processing abort. (screenshot of the original get_tree code omitted)

@@ -229,19 +218,30 @@ def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoT
         logger.info("Current rate limit info.", extra={"rate_limit": rate_limit})
 
         for index, repo_info in enumerate(repositories):
+            repo_full_name = repo_info["full_name"]
+            extra = {"repo_full_name": repo_full_name}
             # Only use the cache if we drop below the lower ceiling
             # We will fetch after the limit is reset (every hour)
             if not only_use_cache and remaining_requests <= MINIMUM_REQUESTS:
                 only_use_cache = True
                 logger.info(
-                    "Too few requests remaining. Grabbing values from the cache.",
-                    extra={"repo_info": repo_info},
+                    "Too few requests remaining. Grabbing values from the cache.", extra=extra
                 )
-            # The Github API rate limit is reset every hour
-            # Spread the expiration of the cache of each repo across the day
-            trees[repo_info["full_name"]] = self._populate_tree(
-                repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24))
-            )
+            try:
+                # The Github API rate limit is reset every hour
+                # Spread the expiration of the cache of each repo across the day
+                trees[repo_full_name] = self._populate_tree(
+                    repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24))
+                )
+            except ApiError as error:
+                process_error(error, extra)
+            except Exception:
+                # Report for investigation but do not stop processing
+                logger.exception(
+                    "Failed to populate_tree. Investigate. Continuing execution.", extra=extra
+                )
 
             remaining_requests -= 1
 
         return trees

Review comment (armenzg), on the new try around _populate_tree: If a call to _populate_tree raises an error, we will now report to Sentry and not abort the processing.
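
For reference, the (3600 * 24) + (3600 * (index % 24)) expiration used above gives every repository a 24-hour base TTL plus an extra hour per repository (wrapping every 24 repos), so the cached trees for an org do not all expire in the same hour. A standalone sketch of just that arithmetic (illustrative only, not part of the PR):

def cache_ttl_seconds(index: int) -> int:
    # 24 h base plus a per-repo offset of 0 to 23 h, matching the call in _populate_trees
    return (3600 * 24) + (3600 * (index % 24))


if __name__ == "__main__":
    for index in (0, 1, 23, 24, 25):
        print(f"repo #{index}: expires in {cache_ttl_seconds(index) / 3600:.0f} hours")
    # repo #0: 24 h, #1: 25 h, ..., #23: 47 h, then #24 wraps back to 24 h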
tests/sentry/integrations/github/test_integration.py (27 changes: 18 additions, 9 deletions)
@@ -609,13 +609,11 @@ def get_installation_helper(self):
     def test_get_trees_for_org_works(self):
         """Fetch the tree representation of a repo"""
         installation = self.get_installation_helper()
+        cache.clear()
         self.set_rate_limit(MINIMUM_REQUESTS + 50)
         expected_trees = {
-            "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []),
-            "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []),
             "Test-Organization/foo": RepoTree(
-                Repo("Test-Organization/foo", "master"),
-                ["src/sentry/api/endpoints/auth_login.py"],
+                Repo("Test-Organization/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"]
             ),
             "Test-Organization/xyz": RepoTree(
                 Repo("Test-Organization/xyz", "master"), ["src/foo.py"]

Review comment (armenzg), on the added cache.clear(): I noticed that the next test would sometimes fail because the cache side effects of this test would affect the next. I tried to add this as part of tearDown but it did not seem to matter. I will investigate on another occasion.

Review comment (armenzg), on the removed bar and baz entries: bar and baz return 4xx errors, thus we do not store them in the trees object anymore.
@@ -646,6 +644,8 @@ def test_get_trees_for_org_works(self):
     def test_get_trees_for_org_prevent_exhaustion_some_repos(self):
         """Some repos will hit the network but the rest will grab from the cache."""
         gh_org = "Test-Organization"
+        repos_key = "githubtrees:repositories:Test-Organization"
+        cache.clear()
         installation = self.get_installation_helper()
         expected_trees = {
             f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]),

@@ -657,8 +657,15 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self):
         with patch("sentry.integrations.github.client.MINIMUM_REQUESTS", new=5, autospec=False):
             # We start with one request left before reaching the minimum remaining requests floor
             self.set_rate_limit(remaining=6)
+            assert cache.get(repos_key) is None
             trees = installation.get_trees_for_org()
             assert trees == expected_trees  # xyz will have files but not foo
+            assert cache.get(repos_key) == [
+                {"full_name": "Test-Organization/xyz", "default_branch": "master"},
+                {"full_name": "Test-Organization/foo", "default_branch": "master"},
+                {"full_name": "Test-Organization/bar", "default_branch": "main"},
+                {"full_name": "Test-Organization/baz", "default_branch": "master"},
+            ]
 
             # Another call should not make us lose the files for xyz
             self.set_rate_limit(remaining=5)

Review comment (armenzg), on the added assert cache.get(repos_key) is None: This is a good enough check to notice that the cache was cleared.

@@ -668,8 +675,10 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self):
             # We reset the remaining values
             self.set_rate_limit(remaining=20)
             trees = installation.get_trees_for_org()
-            # Now that the rate limit is reset we should get files for foo
-            expected_trees[f"{gh_org}/foo"] = RepoTree(
-                Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"]
-            )
-            assert trees == expected_trees
+            assert trees == {
+                f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]),
+                # Now that the rate limit is reset we should get files for foo
+                f"{gh_org}/foo": RepoTree(
+                    Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"]
+                ),
+            }
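
The exhaustion test relies on the gate in _populate_trees: with MINIMUM_REQUESTS patched to 5 and 6 requests remaining, only the first repository (xyz, per the cached repository order above) hits the network before the loop switches to cache-only mode, which is why xyz has files and foo does not. A tiny standalone simulation of that gate, with invented names and no GitHub calls:

from typing import List

MINIMUM_REQUESTS = 5  # the value patched in by the test


def simulate_gate(remaining_requests: int, repo_count: int) -> List[bool]:
    """Return, per repo, whether it would hit the network (True) or use only the cache (False)."""
    hits_network: List[bool] = []
    only_use_cache = False
    for _ in range(repo_count):
        if not only_use_cache and remaining_requests <= MINIMUM_REQUESTS:
            only_use_cache = True
        hits_network.append(not only_use_cache)
        remaining_requests -= 1
    return hits_network


print(simulate_gate(remaining_requests=6, repo_count=4))  # [True, False, False, False]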