From d9fd5f9836f4667ba968c60e99674665bedc94e4 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sat, 19 Nov 2022 14:41:23 +0100 Subject: [PATCH 01/81] Differentiate between gitlab and github instances Differentiate between gitlab and github instances when cloning a git repository using a token over https. --- gitlab2prov/adapters/fetch/git.py | 8 ++++++-- gitlab2prov/adapters/fetch/gitlab.py | 4 ++-- gitlab2prov/adapters/fetch/utils.py | 17 ++++++++++------- tests/unit/test_fetch_utils.py | 20 ++++++++++++++------ 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/gitlab2prov/adapters/fetch/git.py b/gitlab2prov/adapters/fetch/git.py index e86f90d..a204853 100644 --- a/gitlab2prov/adapters/fetch/git.py +++ b/gitlab2prov/adapters/fetch/git.py @@ -37,9 +37,13 @@ def __exit__(self, exc_type, exc_val, exc_tb): self._tmpdir.cleanup() def do_clone(self) -> None: - url = clone_over_https_url(self.url, self.token) + clone_url = "" + if "gitlab.com" in self.url: + clone_url = clone_over_https_url(self.url, self.token, "gitlab") + if "github.com" in self.url: + clone_url = clone_over_https_url(self.url, self.token, "github") self._repo = Repo.clone_from( - url=url, + url=clone_url, to_path=self._tmpdir.name, ) diff --git a/gitlab2prov/adapters/fetch/gitlab.py b/gitlab2prov/adapters/fetch/gitlab.py index d153f30..9f6b073 100644 --- a/gitlab2prov/adapters/fetch/gitlab.py +++ b/gitlab2prov/adapters/fetch/gitlab.py @@ -13,7 +13,7 @@ from gitlab.v4.objects import ProjectTag from gitlab2prov.adapters.fetch.annotations import parse_annotations -from gitlab2prov.adapters.fetch.utils import gitlab_url +from gitlab2prov.adapters.fetch.utils import instance_url from gitlab2prov.adapters.fetch.utils import project_slug from gitlab2prov.domain.constants import ProvRole from gitlab2prov.domain.objects import Asset @@ -36,7 +36,7 @@ class GitlabFetcher: _project: Project | None = field(init=False, default=None) def do_login(self) -> None: - gl = Gitlab(url=gitlab_url(self.url), 
private_token=self.token) + gl = Gitlab(url=instance_url(self.url), private_token=self.token) self._project = gl.projects.get(project_slug(self.url)) def fetch_gitlab( diff --git a/gitlab2prov/adapters/fetch/utils.py b/gitlab2prov/adapters/fetch/utils.py index 4b83042..32d26a0 100644 --- a/gitlab2prov/adapters/fetch/utils.py +++ b/gitlab2prov/adapters/fetch/utils.py @@ -2,17 +2,20 @@ def project_slug(url: str) -> str: - path = urlsplit(url).path - if path is None: - return None - return path.strip("/") + if path := urlsplit(url).path: + owner, project = (s for s in path.split("/") if s) + return f"{owner}/{project}" + return None -def gitlab_url(url: str) -> str: +def instance_url(url: str) -> str: split = urlsplit(url) return f"{split.scheme}://{split.netloc}" -def clone_over_https_url(url: str, token: str) -> str: +def clone_over_https_url(url: str, token: str, platform: str = "gitlab") -> str: split = urlsplit(url) - return f"https://gitlab.com:{token}@{split.netloc}/{project_slug(url)}" + if platform == "gitlab": + return f"https://gitlab.com:{token}@{split.netloc}/{project_slug(url)}" + if platform == "github": + return f"https://{token}@{split.netloc}/{project_slug(url)}.git" diff --git a/tests/unit/test_fetch_utils.py b/tests/unit/test_fetch_utils.py index 5cdb6f7..8e0ee58 100644 --- a/tests/unit/test_fetch_utils.py +++ b/tests/unit/test_fetch_utils.py @@ -3,15 +3,23 @@ class TestHelpers: def test_project_slug(self): - expected_slug = "group/project" - assert expected_slug == utils.project_slug("https://gitlab.com/group/project") + expected_slug = "owner/project" + assert expected_slug == utils.project_slug("https://gitlab.com/owner/project") def test_gitlab_url(self): expected_url = "https://gitlab.com" - assert expected_url == utils.gitlab_url("https://gitlab.com/group/project") + assert expected_url == utils.instance_url("https://gitlab.com/owner/project") + + def test_github_url(self): + expected_url = "https://github.com" + assert expected_url == 
utils.instance_url("https://github.com/owner/project") def test_clone_over_https_url(self): - expected_url = "https://gitlab.com:TOKEN@gitlab.com/group/project" - assert expected_url == utils.clone_over_https_url( - "https://gitlab.com/group/project", "TOKEN" + expected_gitlab_url = "https://gitlab.com:TOKEN@gitlab.com/owner/project" + assert expected_gitlab_url == utils.clone_over_https_url( + "https://gitlab.com/owner/project", "TOKEN", "gitlab" + ) + expected_github_url = "https://TOKEN@github.com/owner/project.git" + assert expected_github_url == utils.clone_over_https_url( + "https://github.com/owner/project", "TOKEN", "github" ) From 3ecf13c9d5b47930eac9f608de6fbdeba2b312d5 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sat, 19 Nov 2022 17:31:36 +0100 Subject: [PATCH 02/81] Restructure getting started section --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8496af8..f21ba16 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,10 @@ pip install gitlab2prov[dev] # PyPi, install with extras ## ⚡ Getting started -`gitlab2prov` needs a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab API. -Follow [this guide](./docs/guides/tokens.md) to create an access token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes). +`gitlab2prov` requires a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab API. 
+ +Use the following guide to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: +- [Create a personal access token (GitLab)](./docs/guides/tokens.md) ## 🚀‍ Usage From 2865e5e9eb008f1f8426f8cd00806dbf224e81d8 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sat, 3 Dec 2022 13:48:10 +0100 Subject: [PATCH 03/81] Add PyGithub to dependencies --- pyproject.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 88b6b1a..d30a7a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,15 @@ authors = [{ name = "Claas de Boer", email = "claas.deboer@dlr.de" }] maintainers = [ { name = "Andreas Schreiber", email = "andreas.schreiber@dlr.de" }, ] -dependencies = ["prov>=2.0.0", "git-python", "python-gitlab", "jsonschema", "ruamel.yaml", "pydot>=1.2.0"] +dependencies = [ + "prov>=2.0.0", + "git-python", + "python-gitlab", + "jsonschema", + "ruamel.yaml", + "pydot>=1.2.0", + "PyGithub", +] keywords = [ "prov", "gitlab", From b49eae10d8862ae2725d4c10508eba6a937a0f0b Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:08:18 +0100 Subject: [PATCH 04/81] Add github specific dataclasses --- gitlab2prov/domain/objects.py | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/gitlab2prov/domain/objects.py b/gitlab2prov/domain/objects.py index 657f4b5..50e82a4 100644 --- a/gitlab2prov/domain/objects.py +++ b/gitlab2prov/domain/objects.py @@ -210,6 +210,44 @@ def annotated_versions(self) -> list[AnnotatedVersion]: ] +@dataclass(unsafe_hash=True, kw_only=True) +class GithubIssue(ProvMixin, EntityMixin): + number: str # id + id: str # analogous to gitlab iid + title: str + body: str = field(repr=False) + url: str = field(repr=False) + author: User = field(repr=False, metadata=IS_RELATION) + annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) + created_at: 
datetime = field(repr=False) + closed_at: datetime | None = field(repr=False, default=None) + prov_type: ProvType = field(init=False, repr=False, default=ProvType.ISSUE) + + @property + def creation(self) -> Creation: + return Creation( + creation_id=self.number, + prov_start=self.created_at, + prov_end=self.closed_at, + prov_type=ProvType.ISSUE_CREATION, + ) + + @property + def first_version(self) -> Version: + return Version(version_id=self.number, prov_type=ProvType.ISSUE_VERSION) + + @property + def annotated_versions(self) -> list[AnnotatedVersion]: + return [ + AnnotatedVersion( + version_id=self.number, + annotation_id=annotation.id, + prov_type=ProvType.ISSUE_VERSION_ANNOTATED, + ) + for annotation in self.annotations + ] + + @dataclass(unsafe_hash=True, kw_only=True) class GitlabCommit(ProvMixin, EntityMixin): hexsha: str @@ -245,6 +283,41 @@ def annotated_versions(self) -> list[AnnotatedVersion]: ] +@dataclass(unsafe_hash=True, kw_only=True) +class GithubCommit(ProvMixin, EntityMixin): + hexsha: str + url: str = field(repr=False) + author: User = field(repr=False, metadata=IS_RELATION) + annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) # comments ... 
+ authored_at: datetime = field(repr=False) + committed_at: datetime = field(repr=False) + prov_type: ProvType = field(init=False, repr=False, default=ProvType.GITHUB_COMMIT) + + @property + def creation(self) -> Creation: + return Creation( + creation_id=self.sha, + prov_start=self.authored_at, + prov_end=self.committed_at, + prov_type=ProvType.GITHUB_COMMIT_CREATION, + ) + + @property + def first_version(self) -> Version: + return Version(version_id=self.hexsha, prov_type=ProvType.GITHUB_COMMIT_VERSION) + + @property + def annotated_versions(self) -> list[AnnotatedVersion]: + return [ + AnnotatedVersion( + version_id=self.hexsha, + annotation_id=annotation.id, + prov_type=ProvType.GITHUB_COMMIT_VERSION_ANNOTATED, + ) + for annotation in self.annotations + ] + + @dataclass(unsafe_hash=True, kw_only=True) class MergeRequest(ProvMixin, EntityMixin): id: str @@ -287,6 +360,47 @@ def annotated_versions(self) -> list[AnnotatedVersion]: ] +@dataclass(unsafe_hash=True, kw_only=True) +class GithubPullRequest(ProvMixin, EntityMixin): + number: str # id + id: str # iid + title: str + body: str = field(repr=False) + url: str = field(repr=False) + head: str = field(repr=False) # source_branch + base: str = field(repr=False) # target_branch + author: User = field(repr=False, metadata=IS_RELATION) + annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) + created_at: datetime = field(repr=False) + closed_at: datetime | None = field(repr=False, default=None) + merged_at: datetime | None = field(repr=False, default=None) # TODO: is this field necessary? 
+ prov_type: ProvType = field(init=False, repr=False, default=ProvType.PULL_REQUEST) + + @property + def creation(self) -> Creation: + return Creation( + creation_id=self.number, + prov_start=self.created_at, + prov_end=self.closed_at, + prov_type=ProvType.PULL_REQUEST_CREATION, + ) + + @property + def first_version(self) -> Version: + return Version(version_id=self.number, prov_type=ProvType.PULL_REQUEST_VERSION) + + @property + def annotated_versions(self) -> list[AnnotatedVersion]: + return [ + AnnotatedVersion( + version_id=self.number, + annotation_id=annotation.id, + prov_type=ProvType.PULL_REQUEST_VERSION_ANNOTATED, + ) + for annotation in self.annotations + ] + + @dataclass(unsafe_hash=True, kw_only=True) class Tag(ProvMixin, EntityMixin): name: str From beb5f9d18b96d0b98a84eb1969867eac8152600a Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:08:36 +0100 Subject: [PATCH 05/81] Add fetcher for github resources --- gitlab2prov/adapters/fetch/github.py | 240 +++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 gitlab2prov/adapters/fetch/github.py diff --git a/gitlab2prov/adapters/fetch/github.py b/gitlab2prov/adapters/fetch/github.py new file mode 100644 index 0000000..737798f --- /dev/null +++ b/gitlab2prov/adapters/fetch/github.py @@ -0,0 +1,240 @@ +import logging + +from dataclasses import dataclass, field + +import github + +from gitlab2prov.adapters.fetch.annotations import parse_annotations +from gitlab2prov.adapters.fetch.utils import project_slug +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import ( + Asset, + GithubCommit, + Release, + Tag, + User, + GithubCommit, + GithubIssue, + GithubPullRequest, +) + + +log = logging.getLogger(__name__) + + +@dataclass +class GithubFetcher: + repository: github.Repository.Repository = field(init=False) + + def do_login(self, url: str, token: str) -> None: + gh = github.Github(login_or_token=token, per_page=100) + log.warn(f"Remaining 
requests: {gh.rate_limiting[0]}") + self.repository = gh.get_repo(full_name_or_id=project_slug(url)) + + def do_fetch(self): + yield from self.fetch_commits() + yield from self.fetch_issues() + yield from self.fetch_pullrequests() + yield from self.fetch_releases() + yield from self.fetch_tags() + + def fetch_commits(self): + """ + commits can have statuses + https://docs.github.com/en/rest/reference/repos#list-commit-statuses-for-a-reference + commits can have checks + https://docs.github.com/en/rest/reference/checks#list-check-runs-for-a-git-reference + commits can have comments + https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository + which users can add reactions to (emoji) + https://docs.github.com/en/rest/reference/reactions#list-reactions-for-a-commit-comment + + annotations should be parsed from anything that adds additional information to a commit + - commit statuses + - commit checks + - commit comments + - commit reactions + """ + for commit in self.repository.get_commits(): + + parseables = [] + parseables.extend(commit.get_statuses()) + parseables.extend(commit.get_comments()) + parseables.extend(comment.get_reactions() for comment in commit.get_comments()) + + yield self.commit2ir(commit, raw_annotations=parseables) + + @staticmethod + def commit2ir(commit, raw_annotations): + return GithubCommit( + hexsha=commit.sha, + url=commit.url, + author=User( + name=commit.commit.author.name, + email=commit.commit.author.email, + prov_role=ProvRole.AUTHOR_GITHUB_COMMIT, + ), + annotations=[], + # parse_annotations(parseables), + authored_at=commit.commit.author.date, + committed_at=commit.commit.committer.date, + ) + + def fetch_issues(self): + """ + issues can have comments + https://docs.github.com/en/rest/reference/issues#list-issue-comments + which users can add reactions to (emoji) + https://docs.github.com/en/rest/reference/reactions#list-reactions-for-an-issue-comment + users can add reactions to issues + 
https://docs.github.com/en/rest/reference/reactions#list-reactions-for-an-issue + issues can have labels + https://docs.github.com/en/rest/reference/issues#list-labels-for-an-issue + issues can have events + https://docs.github.com/en/rest/reference/issues#list-issue-events + issues can have a timeline + https://docs.github.com/en/rest/reference/issues#list-timeline-events-for-an-issue + issues can have milestones + https://docs.github.com/en/rest/reference/issues#list-milestones-for-an-issue + + annotations should be parsed from anything that adds additional information to an issue + - issue comments + - issue reactions + - issue comment reactions + - issue events + - issue labels + - issue milestones + + TODO: do events overlap with information gained from other resources? + """ + for issue in self.repository.get_issues(state="all"): + + parseables = [] + parseables.extend(issue.get_comments()) + parseables.extend(comment.get_reactions() for comment in issue.get_comments()) + parseables.extend(issue.get_labels()) + parseables.extend(issue.get_events()) + parseables.extend(issue.get_timeline()) + + yield self.issue2ir(issue, raw_annotations=parseables) + + @staticmethod + def issue2ir(issue, raw_annotations): + return GithubIssue( + number=issue.number, + id=issue.id, + title=issue.title, + body=issue.body, + url=issue.url, + author=User( + name=issue.user.name, email=issue.user.email, prov_role=ProvRole.AUTHOR_ISSUE + ), + annotations=[], + # parse_annotations(raw_annotations), + created_at=issue.created_at, + closed_at=issue.closed_at, + ) + + def fetch_pullrequests(self): + """ + all pull requests are issues, but not all issues are pull requests + therefore, we can use the same api for some shared resources but not others + + pull requests can have comments (same as issues) + https://docs.github.com/en/rest/reference/pulls#list-review-comments-on-a-pull-request + users can add reactions to pull request comments (same as issues) + 
https://docs.github.com/en/rest/reactions?apiVersion=2022-11-28#list-reactions-for-an-issue-comment + to get the reactions for a pull request, we need to get the reactions for it as an issue + https://docs.github.com/en/rest/reference/issues#list-reactions-for-an-issue + pull requests can have labels (same as issues) + https://docs.github.com/en/rest/reference/issues#list-labels-for-an-issue + pull requests can have milestones (same as issues) + https://docs.github.com/en/rest/reference/issues#list-milestones-for-an-issue + pull requests can have events (same as issues) + https://docs.github.com/en/rest/reference/issues#list-issue-events + pull requests can have a timeline???? if treated as an issue (same as issues) + https://docs.github.com/en/rest/reference/issues#list-timeline-events-for-an-issue + pull requests can have reviews (grouped review comments with a state and optional body) + https://docs.github.com/en/rest/reference/pulls#list-reviews-on-a-pull-request + pull requests can have review comments + https://docs.github.com/en/rest/reference/pulls#list-review-comments-on-a-pull-request + pull requests review comment can have reactions + https://docs.github.com/en/rest/reference/reactions#list-reactions-for-a-pull-request-review-comment + """ + for pull in self.repository.get_pulls(state="all"): + + raw_annotations = [] + raw_annotations.extend(pull.get_comments()) + raw_annotations.extend(comment.get_reactions() for comment in pull.get_comments()) + raw_annotations.extend(pull.get_labels()) + raw_annotations.extend(pull.get_review_comments()) + raw_annotations.extend( + comment.get_reactions() for comment in pull.get_review_comments() + ) + raw_annotations.extend(pull.get_reviews()) + raw_annotations.extend(pull.as_issue().get_reactions()) + raw_annotations.extend(pull.as_issue().get_events()) + raw_annotations.extend(pull.as_issue().get_timeline()) + + yield self.pull2ir(pull, raw_annotations=raw_annotations) + + @staticmethod + def pull2ir(pull, 
raw_annotations): + return GithubPullRequest( + number=pull.number, + id=pull.id, + title=pull.title, + body=pull.body, + url=pull.url, + head=pull.head.ref, + base=pull.base.ref, + author=User( + name=pull.user.name, + email=pull.user.email, + prov_role=ProvRole.AUTHOR_PULL_REQUEST, + ), + annotations=[], + # parse_annotations(raw_annotations), + created_at=pull.created_at, + closed_at=pull.closed_at, + merged_at=pull.merged_at, + ) + + def fetch_releases(self): + for release in self.repository.get_releases(): + yield self.release2ir(release) + + @staticmethod + def release2ir(release): + return Release( + name=release.title, + description=release.body, + tag_name=release.tag_name, + author=User( + name=release.author.name, + email=release.author.email, + prov_role=ProvRole.AUTHOR_RELEASE, + ), + assets=[Asset(asset.url, asset.content_type) for asset in release.get_assets()], + evidences=[], + created_at=release.created_at, + released_at=release.published_at, + ) + + def fetch_tags(self): + for tag in self.repository.get_tags(): + yield self.tag2ir(tag) + + @staticmethod + def tag2ir(tag): + return Tag( + name=tag.name, + hexsha=tag.commit.sha, + message=tag.commit.commit.message, + author=User( + name=tag.commit.author.name, + email=tag.commit.author.email, + prov_role=ProvRole.AUTHOR_TAG, + ), + created_at=tag.commit.commit.author.date, + ) From e4ca20cdcd6e23eca468ce08f4ed7da80188bc72 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:08:53 +0100 Subject: [PATCH 06/81] Change login behaviour for gitlab fetcher --- gitlab2prov/adapters/fetch/gitlab.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gitlab2prov/adapters/fetch/gitlab.py b/gitlab2prov/adapters/fetch/gitlab.py index 9f6b073..d6539a9 100644 --- a/gitlab2prov/adapters/fetch/gitlab.py +++ b/gitlab2prov/adapters/fetch/gitlab.py @@ -31,13 +31,11 @@ @dataclass class GitlabFetcher: - url: str - token: str _project: Project | None = field(init=False, default=None) - def 
do_login(self) -> None: - gl = Gitlab(url=instance_url(self.url), private_token=self.token) - self._project = gl.projects.get(project_slug(self.url)) + def do_login(self, url, token) -> None: + gl = Gitlab(url=instance_url(url), private_token=token) + self._project = gl.projects.get(project_slug(url)) def fetch_gitlab( self, From af35224f66a1040679104c49432977b8706fa5da Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:09:18 +0100 Subject: [PATCH 07/81] Choose the correct fetcher when handling urls --- gitlab2prov/bootstrap.py | 3 ++- gitlab2prov/service_layer/handlers.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gitlab2prov/bootstrap.py b/gitlab2prov/bootstrap.py index 7d4adf2..dbae7ea 100644 --- a/gitlab2prov/bootstrap.py +++ b/gitlab2prov/bootstrap.py @@ -3,7 +3,7 @@ from typing import Type from gitlab2prov.service_layer import handlers, messagebus, unit_of_work -from gitlab2prov.adapters.fetch import GitFetcher, GitlabFetcher +from gitlab2prov.adapters.fetch import GitFetcher, GitlabFetcher, GithubFetcher log = logging.getLogger(__name__) @@ -18,6 +18,7 @@ def bootstrap( "uow": uow, "git_fetcher": git_fetcher, "gitlab_fetcher": gitlab_fetcher, + "github_fetcher": GithubFetcher, } injected_handlers = { command_type: [inject_dependencies(handler, dependencies) for handler in handlers] diff --git a/gitlab2prov/service_layer/handlers.py b/gitlab2prov/service_layer/handlers.py index 9e6535e..88259ea 100644 --- a/gitlab2prov/service_layer/handlers.py +++ b/gitlab2prov/service_layer/handlers.py @@ -18,9 +18,9 @@ def fetch_git(cmd: commands.Fetch, uow, git_fetcher) -> None: uow.commit() -def fetch_gitlab(cmd: commands.Fetch, uow, gitlab_fetcher) -> None: - fetcher = gitlab_fetcher(cmd.url, cmd.token) - fetcher.do_login() +def fetch_gitlab(cmd: commands.Fetch, uow, gitlab_fetcher, github_fetcher) -> None: + fetcher = gitlab_fetcher() if "gitlab" in cmd.url else github_fetcher() + fetcher.do_login(cmd.url, cmd.token) with uow: 
for resource in fetcher.fetch_gitlab(): log.info(f"add {resource=}") From 5c58bb2306474e4ad3aaef714c6b962cc03f05b6 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:09:34 +0100 Subject: [PATCH 08/81] Add constants for github specific content --- gitlab2prov/domain/constants.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gitlab2prov/domain/constants.py b/gitlab2prov/domain/constants.py index 45779df..ea416e7 100644 --- a/gitlab2prov/domain/constants.py +++ b/gitlab2prov/domain/constants.py @@ -32,8 +32,10 @@ class ProvRole: COMMITTER = "Committer" AUTHOR = "Author" AUTHOR_GITLAB_COMMIT = "GitlabCommitAuthor" + AUTHOR_GITHUB_COMMIT = "GithubCommitAuthor" AUTHOR_ISSUE = "IssueAuthor" AUTHOR_MERGE_REQUEST = "MergeRequestAuthor" + AUTHOR_PULL_REQUEST = "PullRequestAuthor" AUTHOR_RELEASE = "ReleaseAuthor" AUTHOR_TAG = "TagAuthor" ANNOTATOR = "Annotator" @@ -60,6 +62,10 @@ class ProvType: GITLAB_COMMIT_VERSION = "GitlabCommitVersion" GITLAB_COMMIT_VERSION_ANNOTATED = "AnnotatedGitlabCommitVersion" GITLAB_COMMIT_CREATION = "GitlabCommitCreation" + GITHUB_COMMIT = "GithubCommit" + GITHUB_COMMIT_VERSION = "GithubCommitVersion" + GITHUB_COMMIT_VERSION_ANNOTATED = "AnnotatedGithubCommitVersion" + GITHUB_COMMIT_CREATION = "GithubCommitCreation" ISSUE = "Issue" ISSUE_VERSION = "IssueVersion" ISSUE_VERSION_ANNOTATED = "AnnotatedIssueVersion" @@ -68,6 +74,10 @@ class ProvType: MERGE_REQUEST_VERSION = "MergeRequestVersion" MERGE_REQUEST_VERSION_ANNOTATED = "AnnotatedMergeRequestVersion" MERGE_REQUEST_CREATION = "MergeRequestCreation" + PULL_REQUEST = "PullRequest" + PULL_REQUEST_VERSION = "PullRequestVersion" + PULL_REQUEST_VERSION_ANNOTAED = "AnnotatedPullRequestVersion" + PULL_REQUEST_CREATION = "PullRequestCreation" ANNOTATION = "Annotation" TAG = "Tag" TAG_CREATION = "TagCreation" From 1f4a8ca534dd156fb85ebf5c538f7549266c60fe Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:09:53 +0100 Subject: [PATCH 09/81] Allow models to handle 
gitlab resources --- gitlab2prov/prov/model.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/gitlab2prov/prov/model.py b/gitlab2prov/prov/model.py index f936ed3..b24aad1 100644 --- a/gitlab2prov/prov/model.py +++ b/gitlab2prov/prov/model.py @@ -9,8 +9,10 @@ FileRevision, GitCommit, GitlabCommit, + GithubCommit, Issue, MergeRequest, + GithubPullRequest, Release, Tag, ) @@ -172,11 +174,19 @@ def deletion( def gitlab_commit_model(resources, graph: ProvDocument = None): if graph is None: graph = graph_factory() - for gitlab_commit in resources.list_all(GitlabCommit): - git_commit = resources.get(GitCommit, hexsha=gitlab_commit.hexsha) - graph.update(commit_creation(gitlab_commit, git_commit)) - graph.update(annotation_chain(gitlab_commit)) - return graph + + github_commits = resources.list_all(GitlabCommit) + gitlab_commits = resources.list_all(GithubCommit) + + for commit in {*github_commits, *gitlab_commits}: + git_commit = resources.get(GitCommit, hexsha=commit.hexsha) + + creation = commit_creation(commit, git_commit) + annotats = annotation_chain(commit) + + graph.update(creation) + graph.update(annotats) + return graph @@ -192,9 +202,16 @@ def gitlab_issue_model(resources, graph: ProvDocument = None): def gitlab_merge_request_model(resources, graph: ProvDocument = None): if graph is None: graph = graph_factory() - for merge_request in resources.list_all(MergeRequest): - graph.update(resource_creation(merge_request)) - graph.update(annotation_chain(merge_request)) + + merge_requests = resources.list_all(MergeRequest) + pull_requests = resources.list_all(GithubPullRequest) + + for merge_request in {*merge_requests, *pull_requests}: + creation = resource_creation(merge_request) + annotats = annotation_chain(merge_request) + + graph.update(creation) + graph.update(annotats) return graph @@ -205,6 +222,7 @@ def commit_creation( ): if graph is None: graph = graph_factory() + resource = graph.entity(*gitlab_commit) 
creation = graph.activity(*gitlab_commit.creation) first_version = graph.entity(*gitlab_commit.first_version) From 8cea2cefcf940938a720e56835e9d59c5a871c00 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:10:21 +0100 Subject: [PATCH 10/81] Add GithubFetcher to toplevel package objects --- gitlab2prov/adapters/fetch/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gitlab2prov/adapters/fetch/__init__.py b/gitlab2prov/adapters/fetch/__init__.py index 9daabd6..f727350 100644 --- a/gitlab2prov/adapters/fetch/__init__.py +++ b/gitlab2prov/adapters/fetch/__init__.py @@ -1,2 +1,3 @@ from gitlab2prov.adapters.fetch.git import GitFetcher from gitlab2prov.adapters.fetch.gitlab import GitlabFetcher +from gitlab2prov.adapters.fetch.github import GithubFetcher From 851564e6d22f1d6497da64b41eb1d2156cf034a0 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:18:43 +0100 Subject: [PATCH 11/81] Rename tokens.md to gitlab-token.md --- README.md | 2 +- docs/guides/{tokens.md => gitlab-token.md} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/guides/{tokens.md => gitlab-token.md} (100%) diff --git a/README.md b/README.md index f21ba16..91a58e2 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ pip install gitlab2prov[dev] # PyPi, install with extras `gitlab2prov` requires a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab API. 
Use the following guide to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: -- [Create a personal access token (GitLab)](./docs/guides/tokens.md) +- [Create a personal access token (GitLab)](./docs/guides/gitlab-token.md) ## 🚀‍ Usage diff --git a/docs/guides/tokens.md b/docs/guides/gitlab-token.md similarity index 100% rename from docs/guides/tokens.md rename to docs/guides/gitlab-token.md From 18b6c02daccc058d4c24605d81bd0b5d6408cb3e Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:19:44 +0100 Subject: [PATCH 12/81] Add token guide for github --- README.md | 1 + docs/guides/github-token.md | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 docs/guides/github-token.md diff --git a/README.md b/README.md index 91a58e2..656945c 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ pip install gitlab2prov[dev] # PyPi, install with extras Use the following guide to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: - [Create a personal access token (GitLab)](./docs/guides/gitlab-token.md) +- [Create a personal access token (GitLab)](./docs/guides/github-token.md) ## 🚀‍ Usage diff --git a/docs/guides/github-token.md b/docs/guides/github-token.md new file mode 100644 index 0000000..b713db2 --- /dev/null +++ b/docs/guides/github-token.md @@ -0,0 +1,50 @@ +# Create a personal access token (GitHub) + +### 1. Go to GitHub + + +### 2. 
Click on View profile and more +![Step 2 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/336d1f23-3426-4408-85c2-a5b3f26d71c1/3117887f-9162-4c3d-9c08-15d7436a56d3.png?crop=focalpoint&fit=crop&fp-x=0.5000&fp-y=0.5000&fp-z=1.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 3. Click on Settings +![Step 3 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/0d512e13-98ee-4e5e-aea9-baff60d22e8c/bcb0d791-bbde-4781-be05-cd54d5162a67.png?crop=focalpoint&fit=crop&fp-x=0.5000&fp-y=0.5000&fp-z=1.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 4. Click on Developer settings +![Step 4 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/df279561-db15-4b6c-9130-62c7b3e6d276/e01ae958-3c31-442c-ae50-6fdbf776c458.png?crop=focalpoint&fit=crop&fp-x=0.5000&fp-y=0.5000&fp-z=1.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 5. Click on Personal access tokens +![Step 5 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/a70cae0e-f907-4c2f-8902-a1c3225bfb8a/bc65dfa5-4014-4b3c-b6b1-52a4494acbd0.png?crop=focalpoint&fit=crop&fp-x=0.3094&fp-y=0.1748&fp-z=2.4615&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 6. 
Click on Tokens (classic) +![Step 6 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/2b6ed918-f15e-4f63-805e-94c499ca8e61/3b558527-56ad-4846-9d9f-c25b777613e0.png?crop=focalpoint&fit=crop&fp-x=0.3141&fp-y=0.2226&fp-z=2.5600&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 7. Click on Generate new token +![Step 7 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/1586942a-e679-4609-8799-a12fcc836251/e1f708c5-66eb-46e3-b0c0-6e57118cd7a1.png?crop=focalpoint&fit=crop&fp-x=0.6510&fp-y=0.1285&fp-z=2.9058&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 8. Click on Generate new token (classic)… +![Step 8 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/7c765b37-6a3a-4026-9b2a-b61ac4d89bcb/9d8559a1-725e-4bb5-8ab5-6f1b398ff1dd.png?crop=focalpoint&fit=crop&fp-x=0.6227&fp-y=0.2061&fp-z=2.4015&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 9. Assign a name to your token to remember its purpose +![Step 9 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/223acec5-4696-43c0-82d8-56122dc8ff63/49c93bfd-07a5-4206-aeb0-8666d7ad8f0a.png?crop=focalpoint&fit=crop&fp-x=0.4641&fp-y=0.2353&fp-z=2.1192&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 10. 
Check repo … +![Step 10 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/3a682b99-9665-4810-a83c-d9b6a8d709ab/9d3c1dfa-cb59-4bd4-8880-277070d8a1d0.png?crop=focalpoint&fit=crop&fp-x=0.3842&fp-y=0.3118&fp-z=3.2288&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 11. Click on Generate token +![Step 11 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/8fb20c70-b98e-49e4-a41a-f880508909fa/8156f227-b314-4e2d-96c7-19044632b761.png?crop=focalpoint&fit=crop&fp-x=0.4039&fp-y=0.8648&fp-z=2.8444&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 12. Click on Copy token +![Step 12 screenshot](https://images.tango.us/workflows/1e971c6d-e957-48ff-b8ef-d1be891a2621/steps/69e552d2-16c6-48e0-81e7-46f5660c0c4f/6404501f-7fec-4857-9ea5-a9c6cf50eb5b.png?crop=focalpoint&fit=crop&fp-x=0.4882&fp-y=0.3543&fp-z=2.0000&w=1200&mark-w=0.2&mark-pad=0&mark64=aHR0cHM6Ly9pbWFnZXMudGFuZ28udXMvc3RhdGljL21hZGUtd2l0aC10YW5nby13YXRlcm1hcmsucG5n&ar=2560%3A1339) + + +### 13. Done! 
From cb3f0e0e76d97c78f7382f5fa322467f757b5263 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 4 Dec 2022 19:20:28 +0100 Subject: [PATCH 13/81] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 656945c..66bf374 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ pip install gitlab2prov[dev] # PyPi, install with extras Use the following guide to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: - [Create a personal access token (GitLab)](./docs/guides/gitlab-token.md) -- [Create a personal access token (GitLab)](./docs/guides/github-token.md) +- [Create a personal access token (GitHub)](./docs/guides/github-token.md) ## 🚀‍ Usage From 525b9f27cca6a9344b57a2baaf15f3293ce62614 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:44:40 +0100 Subject: [PATCH 14/81] Update fetcher implementation --- .../adapters/fetch/{git.py => _git.py} | 18 +- gitlab2prov/adapters/fetch/_github.py | 161 ++++++++++++ gitlab2prov/adapters/fetch/_gitlab.py | 199 +++++++++++++++ gitlab2prov/adapters/fetch/github.py | 240 ------------------ gitlab2prov/adapters/fetch/gitlab.py | 214 ---------------- 5 files changed, 370 insertions(+), 462 deletions(-) rename gitlab2prov/adapters/fetch/{git.py => _git.py} (91%) create mode 100644 gitlab2prov/adapters/fetch/_github.py create mode 100644 gitlab2prov/adapters/fetch/_gitlab.py delete mode 100644 gitlab2prov/adapters/fetch/github.py delete mode 100644 gitlab2prov/adapters/fetch/gitlab.py diff --git a/gitlab2prov/adapters/fetch/git.py b/gitlab2prov/adapters/fetch/_git.py similarity index 91% rename from gitlab2prov/adapters/fetch/git.py rename to gitlab2prov/adapters/fetch/_git.py index a204853..1daa138 100644 --- a/gitlab2prov/adapters/fetch/git.py +++ b/gitlab2prov/adapters/fetch/_git.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from itertools import 
zip_longest from tempfile import TemporaryDirectory +from pathlib import Path from git import Commit from git import Repo @@ -38,10 +39,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): def do_clone(self) -> None: clone_url = "" - if "gitlab.com" in self.url: + if "gitlab" in self.url: clone_url = clone_over_https_url(self.url, self.token, "gitlab") - if "github.com" in self.url: + if "github" in self.url: clone_url = clone_over_https_url(self.url, self.token, "github") + self._repo = Repo.clone_from( url=clone_url, to_path=self._tmpdir.name, @@ -100,14 +102,14 @@ def parse_log(log: str): def extract_commits(repo: Repo) -> Iterator[GitCommit]: for commit in repo.iter_commits("--all"): yield GitCommit( - hexsha=commit.hexsha, - message=commit.message, + sha=commit.hexsha, title=commit.summary, + message=commit.message, author=get_author(commit), committer=get_committer(commit), parents=[parent.hexsha for parent in commit.parents], - prov_start=commit.authored_datetime, - prov_end=commit.committed_datetime, + start=commit.authored_datetime, + end=commit.committed_datetime, ) @@ -122,7 +124,7 @@ def extract_files(repo: Repo) -> Iterator[File]: # disregard modifications and deletions for diff_item in diff.iter_change_type(ChangeType.ADDED): # path for new files is stored in diff b_path - yield File(path=diff_item.b_path, committed_in=commit.hexsha) + yield File(name=Path(diff_item.b_path).name, path=diff_item.b_path, commit=commit.hexsha) def extract_revisions(repo: Repo) -> Iterator[FileRevision]: @@ -140,7 +142,7 @@ def extract_revisions(repo: Repo) -> Iterator[FileRevision]: ) ): revs.append( - FileRevision(path=path, committed_in=hexsha, change_type=status, original=file) + FileRevision(name=Path(path).name, path=path, commit=hexsha, status=status, file=file) ) # revisions remeber their predecessor (previous revision) for rev, prev in zip_longest(revs, revs[1:]): diff --git a/gitlab2prov/adapters/fetch/_github.py b/gitlab2prov/adapters/fetch/_github.py new file 
mode 100644 index 0000000..346b799 --- /dev/null +++ b/gitlab2prov/adapters/fetch/_github.py @@ -0,0 +1,161 @@ +import logging +import itertools + +from typing import Iterator +from dataclasses import dataclass, field, InitVar + +from github import Github +from github.Repository import Repository + +from gitlab2prov.adapters.fetch.annotations import parse_annotations +from gitlab2prov.adapters.fetch.utils import project_slug +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import ( + Asset, + Release, + User, + Commit, + Issue, + MergeRequest, + GitTag, + Release, +) + + +log = logging.getLogger(__name__) + + +@dataclass +class GithubFetcher: + private_token: InitVar[str] + project_url: InitVar[str] + + client: Github = field(init=False) + repository: Repository = field(init=False) + + def __post_init__(self, private_token, project_url) -> None: + self.client = Github(login_or_token=private_token, per_page=100) + self.repository = self.client.get_repo(full_name_or_id=project_slug(project_url)) + log.warning(f"Remaining requests: {self.client.rate_limiting[0]}") + + def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: + yield from itertools.chain( + self.fetch_commits(), + self.fetch_issues(), + self.fetch_mergerequests(), + self.fetch_releases(), + self.fetch_tags(), + ) + + def fetch_commits(self) -> Iterator[Commit]: + for commit in self.repository.get_commits(): + parseables = [ + *commit.get_statuses(), + *commit.get_comments(), + *(comment.get_reactions() for comment in commit.get_comments()), + ] + yield Commit( + sha=commit.sha, + url=commit.url, + author=User( + commit.commit.author.name, + commit.commit.author.email, + prov_role=ProvRole.COMMIT_AUTHOR, + ), + platform="github", + annotations=parse_annotations(parseables), + authored_at=commit.commit.author.date, + committed_at=commit.commit.committer.date, + ) + + def fetch_issues(self) -> Iterator[Issue]: + for issue in 
self.repository.get_issues(state="all"): + parseables = [ + *issue.get_comments(), + *(comment.get_reactions() for comment in issue.get_comments()), + *issue.get_labels(), + *issue.get_events(), + *issue.get_timeline(), + ] + yield Issue( + id=issue.number, + iid=issue.id, + platform="github", + title=issue.title, + body=issue.body, + url=issue.url, + author=User(issue.user.name, issue.user.email, prov_role=ProvRole.ISSUE_AUTHOR), + annotations=[], + created_at=issue.created_at, + closed_at=issue.closed_at, + ) + + def fetch_mergerequests(self) -> Iterator[MergeRequest]: + for pull in self.repository.get_pulls(state="all"): + raw_annotations = [] + raw_annotations.extend(pull.get_comments()) + raw_annotations.extend(comment.get_reactions() for comment in pull.get_comments()) + raw_annotations.extend(pull.get_labels()) + raw_annotations.extend(pull.get_review_comments()) + raw_annotations.extend( + comment.get_reactions() for comment in pull.get_review_comments() + ) + raw_annotations.extend(pull.get_reviews()) + raw_annotations.extend(pull.as_issue().get_reactions()) + raw_annotations.extend(pull.as_issue().get_events()) + raw_annotations.extend(pull.as_issue().get_timeline()) + + yield MergeRequest( + id=pull.number, + iid=pull.id, + title=pull.title, + body=pull.body, + url=pull.url, + platform="github", + source_branch=pull.base.ref, + target_branch=pull.head.ref, + author=User( + name=pull.user.name, + email=pull.user.email, + prov_role=ProvRole.MERGE_REQUEST_AUTHOR, + ), + annotations=[], + created_at=pull.created_at, + closed_at=pull.closed_at, + merged_at=pull.merged_at, + ) + + def fetch_releases(self) -> Iterator[Release]: + for release in self.repository.get_releases(): + yield Release( + name=release.title, + body=release.body, + tag_name=release.tag_name, + platform="github", + author=User( + name=release.author.name, + email=release.author.email, + prov_role=ProvRole.RELEASE_AUTHOR, + ), + assets=[ + Asset(url=asset.url, format=asset.content_type) + 
for asset in release.get_assets() + ], + evidences=[], + created_at=release.created_at, + released_at=release.published_at, + ) + + def fetch_tags(self) -> Iterator[GitTag]: + for tag in self.repository.get_tags(): + yield GitTag( + name=tag.name, + sha=tag.commit.sha, + message=tag.commit.commit.message, + author=User( + name=tag.commit.author.name, + email=tag.commit.author.email, + prov_role=ProvRole.TAG_AUTHOR, + ), + created_at=tag.commit.commit.author.date, + ) diff --git a/gitlab2prov/adapters/fetch/_gitlab.py b/gitlab2prov/adapters/fetch/_gitlab.py new file mode 100644 index 0000000..5bf94db --- /dev/null +++ b/gitlab2prov/adapters/fetch/_gitlab.py @@ -0,0 +1,199 @@ +import logging +import itertools + +from dataclasses import dataclass, field, InitVar +from typing import Iterator + +from gitlab import Gitlab +from gitlab.exceptions import GitlabListError + +from gitlab2prov.adapters.fetch.annotations import parse_annotations +from gitlab2prov.adapters.fetch.utils import instance_url, project_slug +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import ( + Asset, + Evidence, + Commit, + Issue, + MergeRequest, + Release, + GitTag, + User, + GitTag, +) + + +log = logging.getLogger(__name__) + + +@dataclass +class GitlabFetcher: + private_token: InitVar[str] + url: InitVar[str] = "https://gitlab.com" + + client: Gitlab = field(init=False) + project: Gitlab = field(init=False) + + def __post_init__(self, private_token, url) -> None: + self.client = Gitlab(instance_url(url), private_token=private_token) + self.project = self.client.projects.get(project_slug(url)) + + def log_list_err(self, log: logging.Logger, err: GitlabListError, cls: str) -> None: + log.error(f"failed to fetch {cls} from {instance_url(self.project)}") + log.error(f"error: {err}") + + def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: + yield from itertools.chain( + self.fetch_commits(), + self.fetch_issues(), + 
self.fetch_mergerequests(), + self.fetch_releases(), + self.fetch_tags(), + ) + + def fetch_commits(self) -> Iterator[Commit]: + try: + for commit in self.project.commits.list(all=True, per_page=100): + yield Commit( + sha=commit.id, + url=commit.web_url, + platform="gitlab", + author=User( + commit.author_name, commit.author_email, prov_role=ProvRole.COMMIT_AUTHOR + ), + annotations=parse_annotations( + [ + *commit.comments.list(all=True, system=False), + *commit.comments.list(all=True, system=True), + ] + ), + authored_at=commit.authored_date, + committed_at=commit.committed_date, + ) + except GitlabListError as err: + self.log_list_err(log, err, "commits") + + def fetch_issues(self, state="all") -> Iterator[Issue]: + try: + for issue in self.project.issues.list(all=True, state=state, per_page=100): + yield Issue( + id=issue.id, + iid=issue.iid, + platform="gitlab", + title=issue.title, + body=issue.description, + url=issue.web_url, + author=User( + issue.author.get("name"), + issue.author.get("email"), + gitlab_username=issue.author.get("username"), + gitlab_id=issue.author.get("id"), + prov_role=ProvRole.ISSUE_AUTHOR, + ), + annotations=parse_annotations( + [ + *issue.notes.list(all=True, system=False), + *issue.notes.list(all=True, system=True), + *issue.awardemojis.list(all=True), + *issue.resourcelabelevents.list(all=True), + *( + award + for note in issue.notes.list(all=True) + for award in note.awardemojis.list(all=True) + ), + ] + ), + created_at=issue.created_at, + closed_at=issue.closed_at, + ) + except GitlabListError as err: + self.log_list_err(log, err, "issues") + + def fetch_mergerequests(self, state="all") -> Iterator[MergeRequest]: + try: + for merge in self.project.mergerequests.list(all=True, state=state, per_page=100): + yield MergeRequest( + id=merge.id, + iid=merge.iid, + title=merge.title, + body=merge.description, + url=merge.web_url, + platform="gitlab", + source_branch=merge.source_branch, + target_branch=merge.target_branch, + 
author=User( + merge.author.get("name"), + merge.author.get("email"), + gitlab_username=merge.author.get("username"), + gitlab_id=merge.author.get("id"), + prov_role=ProvRole.MERGE_REQUEST_AUTHOR, + ), + annotations=parse_annotations( + ( + *merge.notes.list(all=True, system=False), + *merge.notes.list(all=True, system=True), + *merge.awardemojis.list(all=True), + *merge.resourcelabelevents.list(all=True), + *( + award + for note in merge.notes.list(all=True) + for award in note.awardemojis.list(all=True) + ), + ) + ), + created_at=merge.created_at, + closed_at=merge.closed_at, + merged_at=merge.merged_at, + first_deployed_to_production_at=getattr(merge, "first_deployed_to_production_at", None), + ) + except GitlabListError as err: + self.log_list_err(log, err, "merge requests") + + def fetch_releases(self) -> Iterator[Release]: + try: + for release in self.project.releases.list(all=True, per_page=100): + yield Release( + name=release.name, + body=release.description, + tag_name=release.tag_name, + author=User( + name=release.author.get("name"), + email=release.author.get("email"), + gitlab_username=release.author.get("username"), + gitlab_id=release.author.get("id"), + prov_role=ProvRole.RELEASE_AUTHOR, + ), + assets=[ + Asset(url=asset.get("url"), format=asset.get("format")) + for asset in release.assets.get("sources", []) + ], + evidences=[ + Evidence( + sha=evidence.get("sha"), + url=evidence.get("filepath"), + collected_at=evidence.get("collected_at"), + ) + for evidence in release.evidences + ], + created_at=release.created_at, + released_at=release.released_at, + ) + except GitlabListError as err: + self.log_list_err(log, err, "releases") + + def fetch_tags(self) -> Iterator[GitTag]: + try: + for tag in self.project.tags.list(all=True, per_page=100): + yield GitTag( + name=tag.name, + sha=tag.target, + message=tag.message, + author=User( + name=tag.commit.get("author_name"), + email=tag.commit.get("author_email"), + prov_role=ProvRole.TAG_AUTHOR, + ), + 
created_at=tag.commit.get("created_at"), + ) + except GitlabListError as err: + self.log_list_err(log, err, "tags") diff --git a/gitlab2prov/adapters/fetch/github.py b/gitlab2prov/adapters/fetch/github.py deleted file mode 100644 index 737798f..0000000 --- a/gitlab2prov/adapters/fetch/github.py +++ /dev/null @@ -1,240 +0,0 @@ -import logging - -from dataclasses import dataclass, field - -import github - -from gitlab2prov.adapters.fetch.annotations import parse_annotations -from gitlab2prov.adapters.fetch.utils import project_slug -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import ( - Asset, - GithubCommit, - Release, - Tag, - User, - GithubCommit, - GithubIssue, - GithubPullRequest, -) - - -log = logging.getLogger(__name__) - - -@dataclass -class GithubFetcher: - repository: github.Repository.Repository = field(init=False) - - def do_login(self, url: str, token: str) -> None: - gh = github.Github(login_or_token=token, per_page=100) - log.warn(f"Remaining requests: {gh.rate_limiting[0]}") - self.repository = gh.get_repo(full_name_or_id=project_slug(url)) - - def do_fetch(self): - yield from self.fetch_commits() - yield from self.fetch_issues() - yield from self.fetch_pullrequests() - yield from self.fetch_releases() - yield from self.fetch_tags() - - def fetch_commits(self): - """ - commits can have statuses - https://docs.github.com/en/rest/reference/repos#list-commit-statuses-for-a-reference - commits can have checks - https://docs.github.com/en/rest/reference/checks#list-check-runs-for-a-git-reference - commits can have comments - https://docs.github.com/en/rest/reference/repos#list-commit-comments-for-a-repository - which users can add reactions to (emoji) - https://docs.github.com/en/rest/reference/reactions#list-reactions-for-a-commit-comment - - annotations should be parsed from anything that adds additional information to a commit - - commit statuses - - commit checks - - commit comments - - commit reactions - """ - 
for commit in self.repository.get_commits(): - - parseables = [] - parseables.extend(commit.get_statuses()) - parseables.extend(commit.get_comments()) - parseables.extend(comment.get_reactions() for comment in commit.get_comments()) - - yield self.commit2ir(commit, raw_annotations=parseables) - - @staticmethod - def commit2ir(commit, raw_annotations): - return GithubCommit( - hexsha=commit.sha, - url=commit.url, - author=User( - name=commit.commit.author.name, - email=commit.commit.author.email, - prov_role=ProvRole.AUTHOR_GITHUB_COMMIT, - ), - annotations=[], - # parse_annotations(parseables), - authored_at=commit.commit.author.date, - committed_at=commit.commit.committer.date, - ) - - def fetch_issues(self): - """ - issues can have comments - https://docs.github.com/en/rest/reference/issues#list-issue-comments - which users can add reactions to (emoji) - https://docs.github.com/en/rest/reference/reactions#list-reactions-for-an-issue-comment - users can add reactions to issues - https://docs.github.com/en/rest/reference/reactions#list-reactions-for-an-issue - issues can have labels - https://docs.github.com/en/rest/reference/issues#list-labels-for-an-issue - issues can have events - https://docs.github.com/en/rest/reference/issues#list-issue-events - issues can have a timeline - https://docs.github.com/en/rest/reference/issues#list-timeline-events-for-an-issue - issues can have milestones - https://docs.github.com/en/rest/reference/issues#list-milestones-for-an-issue - - annotations should be parsed from anything that adds additional information to an issue - - issue comments - - issue reactions - - issue comment reactions - - issue events - - issue labels - - issue milestones - - TODO: do events overlap with information gained from other resources? 
- """ - for issue in self.repository.get_issues(state="all"): - - parseables = [] - parseables.extend(issue.get_comments()) - parseables.extend(comment.get_reactions() for comment in issue.get_comments()) - parseables.extend(issue.get_labels()) - parseables.extend(issue.get_events()) - parseables.extend(issue.get_timeline()) - - yield self.issue2ir(issue, raw_annotations=parseables) - - @staticmethod - def issue2ir(issue, raw_annotations): - return GithubIssue( - number=issue.number, - id=issue.id, - title=issue.title, - body=issue.body, - url=issue.url, - author=User( - name=issue.user.name, email=issue.user.email, prov_role=ProvRole.AUTHOR_ISSUE - ), - annotations=[], - # parse_annotations(raw_annotations), - created_at=issue.created_at, - closed_at=issue.closed_at, - ) - - def fetch_pullrequests(self): - """ - all pull requests are issues, but not all issues are pull requests - therefore, we can use the same api for some shared resources but not others - - pull requests can have comments (same as issues) - https://docs.github.com/en/rest/reference/pulls#list-review-comments-on-a-pull-request - users can add reactions to pull request comments (same as issues) - https://docs.github.com/en/rest/reactions?apiVersion=2022-11-28#list-reactions-for-an-issue-comment - to get the reactions for a pull request, we need to get the reactions for it as an issue - https://docs.github.com/en/rest/reference/issues#list-reactions-for-an-issue - pull requests can have labels (same as issues) - https://docs.github.com/en/rest/reference/issues#list-labels-for-an-issue - pull requests can have milestones (same as issues) - https://docs.github.com/en/rest/reference/issues#list-milestones-for-an-issue - pull requests can have events (same as issues) - https://docs.github.com/en/rest/reference/issues#list-issue-events - pull requests can have a timeline???? 
if treated as an issue (same as issues) - https://docs.github.com/en/rest/reference/issues#list-timeline-events-for-an-issue - pull requests can have reviews (grouped review comments with a state and optional body) - https://docs.github.com/en/rest/reference/pulls#list-reviews-on-a-pull-request - pull requests can have review comments - https://docs.github.com/en/rest/reference/pulls#list-review-comments-on-a-pull-request - pull requests review comment can have reactions - https://docs.github.com/en/rest/reference/reactions#list-reactions-for-a-pull-request-review-comment - """ - for pull in self.repository.get_pulls(state="all"): - - raw_annotations = [] - raw_annotations.extend(pull.get_comments()) - raw_annotations.extend(comment.get_reactions() for comment in pull.get_comments()) - raw_annotations.extend(pull.get_labels()) - raw_annotations.extend(pull.get_review_comments()) - raw_annotations.extend( - comment.get_reactions() for comment in pull.get_review_comments() - ) - raw_annotations.extend(pull.get_reviews()) - raw_annotations.extend(pull.as_issue().get_reactions()) - raw_annotations.extend(pull.as_issue().get_events()) - raw_annotations.extend(pull.as_issue().get_timeline()) - - yield self.pull2ir(pull, raw_annotations=raw_annotations) - - @staticmethod - def pull2ir(pull, raw_annotations): - return GithubPullRequest( - number=pull.number, - id=pull.id, - title=pull.title, - body=pull.body, - url=pull.url, - head=pull.head.ref, - base=pull.base.ref, - author=User( - name=pull.user.name, - email=pull.user.email, - prov_role=ProvRole.AUTHOR_PULL_REQUEST, - ), - annotations=[], - # parse_annotations(raw_annotations), - created_at=pull.created_at, - closed_at=pull.closed_at, - merged_at=pull.merged_at, - ) - - def fetch_releases(self): - for release in self.repository.get_releases(): - yield self.release2ir(release) - - @staticmethod - def release2ir(release): - return Release( - name=release.title, - description=release.body, - tag_name=release.tag_name, - 
author=User( - name=release.author.name, - email=release.author.email, - prov_role=ProvRole.AUTHOR_RELEASE, - ), - assets=[Asset(asset.url, asset.content_type) for asset in release.get_assets()], - evidences=[], - created_at=release.created_at, - released_at=release.published_at, - ) - - def fetch_tags(self): - for tag in self.repository.get_tags(): - yield self.tag2ir(tag) - - @staticmethod - def tag2ir(tag): - return Tag( - name=tag.name, - hexsha=tag.commit.sha, - message=tag.commit.commit.message, - author=User( - name=tag.commit.author.name, - email=tag.commit.author.email, - prov_role=ProvRole.AUTHOR_TAG, - ), - created_at=tag.commit.commit.author.date, - ) diff --git a/gitlab2prov/adapters/fetch/gitlab.py b/gitlab2prov/adapters/fetch/gitlab.py deleted file mode 100644 index d6539a9..0000000 --- a/gitlab2prov/adapters/fetch/gitlab.py +++ /dev/null @@ -1,214 +0,0 @@ -import logging -from collections.abc import Iterator -from dataclasses import dataclass -from dataclasses import field - -from gitlab import Gitlab -from gitlab.exceptions import GitlabListError -from gitlab.v4.objects import Project -from gitlab.v4.objects import ProjectCommit -from gitlab.v4.objects import ProjectIssue -from gitlab.v4.objects import ProjectMergeRequest -from gitlab.v4.objects import ProjectRelease -from gitlab.v4.objects import ProjectTag - -from gitlab2prov.adapters.fetch.annotations import parse_annotations -from gitlab2prov.adapters.fetch.utils import instance_url -from gitlab2prov.adapters.fetch.utils import project_slug -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import Asset -from gitlab2prov.domain.objects import Evidence -from gitlab2prov.domain.objects import GitlabCommit -from gitlab2prov.domain.objects import Issue -from gitlab2prov.domain.objects import MergeRequest -from gitlab2prov.domain.objects import Release -from gitlab2prov.domain.objects import Tag -from gitlab2prov.domain.objects import User - - -log = 
logging.getLogger(__name__) - - -@dataclass -class GitlabFetcher: - _project: Project | None = field(init=False, default=None) - - def do_login(self, url, token) -> None: - gl = Gitlab(url=instance_url(url), private_token=token) - self._project = gl.projects.get(project_slug(url)) - - def fetch_gitlab( - self, - ) -> Iterator[GitlabCommit | Issue | MergeRequest | Release | Tag]: - yield from extract_commits(self._project) - yield from extract_issues(self._project) - yield from extract_mergerequests(self._project) - yield from extract_releases(self._project) - yield from extract_tags(self._project) - - -def on_gitlab_list_error(func): - def wrapped(*args, **kwargs): - try: - return func(*args, **kwargs) - except GitlabListError as e: - msg = f"{func.__module__}.{func.__name__}: {type(e)} due to {e.response_code} HTTP Error." - log.info(msg) - - return wrapped - - -def get_commit_author(commit: ProjectCommit) -> User: - return User( - name=commit.committer_name, - email=commit.committer_email, - gitlab_username=None, - gitlab_id=None, - prov_role=ProvRole.AUTHOR_GITLAB_COMMIT, - ) - - -def get_tag_author(tag: ProjectTag) -> User: - return User( - name=tag.commit.get("author_name"), - email=tag.commit.get("author_email"), - gitlab_username=None, - gitlab_id=None, - prov_role=ProvRole.AUTHOR_TAG, - ) - - -def get_resource_author( - resource: ProjectIssue | ProjectMergeRequest | ProjectRelease, role: ProvRole -) -> User | None: - if not hasattr(resource, "author"): - return None - return User( - name=resource.author.get("name"), - email=resource.author.get("email"), - gitlab_username=resource.author.get("username"), - gitlab_id=resource.author.get("id"), - prov_role=role, - ) - - -def get_assets(release: ProjectRelease) -> list[Asset]: - return [ - Asset(url=asset.get("url"), format=asset.get("format")) - for asset in release.assets.get("sources", []) - ] - - -def get_evidences(release: ProjectRelease) -> list[Evidence]: - return [ - Evidence( - 
hexsha=evidence.get("sha"), - url=evidence.get("filepath"), - collected_at=evidence.get("collected_at"), - ) - for evidence in release.evidences - ] - - -@on_gitlab_list_error -def extract_commits(project: Project) -> Iterator[GitlabCommit]: - for commit in project.commits.list(all=True): - parseables = { - *commit.comments.list(all=True, system=False), - *commit.comments.list(all=True, system=True), - } - yield GitlabCommit( - hexsha=commit.id, - url=commit.web_url, - author=get_commit_author(commit), - annotations=parse_annotations(parseables), - authored_at=commit.authored_date, - committed_at=commit.committed_date, - ) - - -@on_gitlab_list_error -def extract_issues(project: Project) -> Iterator[Issue]: - for issue in project.issues.list(all=True): - parseables = { - *issue.notes.list(all=True, system=False), - *issue.notes.list(all=True, system=True), - *issue.awardemojis.list(all=True), - *issue.resourcelabelevents.list(all=True), - *( - award - for note in issue.notes.list(all=True) - for award in note.awardemojis.list(all=True) - ), - } - yield Issue( - id=issue.id, - iid=issue.iid, - title=issue.title, - description=issue.description, - url=issue.web_url, - author=get_resource_author(issue, ProvRole.AUTHOR_ISSUE), - annotations=parse_annotations(parseables), - created_at=issue.created_at, - closed_at=issue.closed_at, - ) - - -@on_gitlab_list_error -def extract_mergerequests(project: Project) -> Iterator[MergeRequest]: - for mergerequest in project.mergerequests.list(all=True): - parseables = { - *mergerequest.notes.list(all=True, system=False), - *mergerequest.notes.list(all=True, system=True), - *mergerequest.awardemojis.list(all=True), - *mergerequest.resourcelabelevents.list(all=True), - *( - award - for note in mergerequest.notes.list(all=True) - for award in note.awardemojis.list(all=True) - ), - } - yield MergeRequest( - id=mergerequest.id, - iid=mergerequest.iid, - title=mergerequest.title, - description=mergerequest.description, - 
url=mergerequest.web_url, - source_branch=mergerequest.source_branch, - target_branch=mergerequest.target_branch, - author=get_resource_author(mergerequest, ProvRole.AUTHOR_MERGE_REQUEST), - annotations=parse_annotations(parseables), - created_at=mergerequest.created_at, - closed_at=mergerequest.closed_at, - merged_at=mergerequest.merged_at, - first_deployed_to_production_at=getattr( - mergerequest, "first_deployed_to_production_at", None - ), - ) - - -@on_gitlab_list_error -def extract_releases(project: Project) -> Iterator[Release]: - for release in project.releases.list(all=True): - yield Release( - name=release.name, - description=release.description, - tag_name=release.tag_name, - author=get_resource_author(release, ProvRole.AUTHOR_RELEASE), - assets=get_assets(release), - evidences=get_evidences(release), - created_at=release.created_at, - released_at=release.released_at, - ) - - -@on_gitlab_list_error -def extract_tags(project: Project) -> Iterator[Tag]: - for tag in project.tags.list(all=True): - yield Tag( - name=tag.name, - hexsha=tag.target, - message=tag.message, - author=get_tag_author(tag), - created_at=tag.commit.get("created_at"), - ) From fd938c2c7ae220ee7b2db1a9cee420a512c90bb6 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:45:06 +0100 Subject: [PATCH 15/81] Add fetcher factory to choose the correct fetcher based on a project url --- gitlab2prov/adapters/fetch/__init__.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gitlab2prov/adapters/fetch/__init__.py b/gitlab2prov/adapters/fetch/__init__.py index f727350..10e7664 100644 --- a/gitlab2prov/adapters/fetch/__init__.py +++ b/gitlab2prov/adapters/fetch/__init__.py @@ -1,3 +1,14 @@ -from gitlab2prov.adapters.fetch.git import GitFetcher -from gitlab2prov.adapters.fetch.gitlab import GitlabFetcher -from gitlab2prov.adapters.fetch.github import GithubFetcher +from gitlab2prov.adapters.fetch._git import GitFetcher +from 
gitlab2prov.adapters.fetch._gitlab import GitlabFetcher +from gitlab2prov.adapters.fetch._github import GithubFetcher + + +class FetcherFactory: + @staticmethod + def factory(url: str): + if "github" in url: + return GithubFetcher + if "gitlab" in url: + return GitlabFetcher + raise ValueError(f"can't derive fetcher for unknown url {url=}") + \ No newline at end of file From 634784a71e96d69445ae423800b83a6f0b90ac4f Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:45:43 +0100 Subject: [PATCH 16/81] Update domain object implementation --- gitlab2prov/domain/objects.py | 744 ++++++++++++++++++---------------- 1 file changed, 392 insertions(+), 352 deletions(-) diff --git a/gitlab2prov/domain/objects.py b/gitlab2prov/domain/objects.py index 50e82a4..9f22a79 100644 --- a/gitlab2prov/domain/objects.py +++ b/gitlab2prov/domain/objects.py @@ -1,465 +1,505 @@ from __future__ import annotations from dataclasses import dataclass -from dataclasses import Field from dataclasses import field -from dataclasses import fields from datetime import datetime -from itertools import cycle from typing import Any -from urllib.parse import urlencode +from prov.model import ( + PROV_LABEL, + PROV_ROLE, + ProvDocument, + ProvAgent, + ProvActivity, + ProvEntity, + PROV_TYPE, + PROV_ATTR_STARTTIME, + PROV_ATTR_ENDTIME, +) from prov.identifier import QualifiedName -from prov.model import PROV_LABEL -from gitlab2prov.domain.constants import PROV_FIELD_MAP -from gitlab2prov.domain.constants import ProvRole from gitlab2prov.domain.constants import ProvType from gitlab2prov.prov.operations import qualified_name -# metadata for dataclass attributes that relate objects with one another -# such attributes will not be included in the list of prov attributes of a dataclass -IS_RELATION = {"IS_RELATION": True} +PLACEHOLDER = ProvDocument() +PLACEHOLDER.set_default_namespace("http://github.com/dlr-sc/gitlab2prov/") -def is_relation(field: Field): - return field.metadata == IS_RELATION 
+@dataclass +class User: + name: str + email: str + gitlab_username: str | None = None + github_username: str | None = None + gitlab_id: str | None = None + github_id: str | None = None + prov_role: str | None = None + def __post_init__(self): + self.email = self.email.lower() if self.email else None -class ProvMixin: @property - def prov_identifier(self) -> QualifiedName: - attrs = urlencode(dict(self._traverse_repr_fields())) - label = f"{self._prov_type()}?{attrs}" - return qualified_name(label) + def identifier(self) -> QualifiedName: + return qualified_name(f"User?{self.name=}&{self.email=}") + + def to_prov_element(self) -> ProvAgent: + attributes = [ + ("name", self.name), + ("email", self.email), + (PROV_ROLE, self.prov_role), + (PROV_TYPE, ProvType.USER), + ] + if self.gitlab_username: + attributes.append(("gitlab_username", self.gitlab_username)) + if self.github_username: + attributes.append(("github_username", self.github_username)) + if self.gitlab_id: + attributes.append(("gitlab_id", self.gitlab_id)) + if self.github_id: + attributes.append(("github_id", self.github_id)) + return ProvAgent(PLACEHOLDER, self.identifier, attributes) - @property - def prov_label(self) -> QualifiedName: - attrs = urlencode(dict(self._traverse_repr_fields())) - label = f"{self._prov_type()}?{attrs}" - return qualified_name(label) + +@dataclass +class File: + name: str + path: str + commit: str @property - def prov_attributes(self) -> list[tuple[str, str | int | datetime | None]]: - return list(self._traverse_attributes()) - - def _prov_type(self) -> str: - match self.prov_type: - case list(): - return self.prov_type[0] - case _: - return self.prov_type - - def _traverse_repr_fields(self): - for f in fields(self): - if f.repr: - yield f.name, getattr(self, f.name) - - def _traverse_attributes(self): - for f in fields(self): - if not is_relation(f): - yield from self._expand_attribute(f.name, getattr(self, f.name)) - yield (PROV_LABEL, self.prov_label) - - def 
_expand_attribute(self, key, val): - key = PROV_FIELD_MAP.get(key, key) - match val: - case list(): - yield from zip(cycle([key]), val) - case dict(): - yield from val.items() - case _: - yield key, val + def identifier(self) -> QualifiedName: + return qualified_name(f"File?{self.name=}&{self.path=}&{self.commit=}") + + def to_prov_element(self) -> ProvEntity: + attributes = [("name", self.name), ("path", self.path), (PROV_TYPE, ProvType.FILE)] + return ProvEntity( + PLACEHOLDER, + self.identifier, + attributes, + ) @dataclass -class AgentMixin: - def __iter__(self): - yield self.prov_identifier - yield self.prov_attributes +class FileRevision(File): + status: str + file: File | None = None + previous: FileRevision | None = None + @property + def identifier(self) -> QualifiedName: + return qualified_name( + f"FileRevision?{self.name=}&{self.path=}&{self.commit=}&{self.status=}" + ) -@dataclass -class EntityMixin: - def __iter__(self): - yield self.prov_identifier - yield self.prov_attributes + def to_prov_element(self) -> ProvEntity: + attributes = [ + ("name", self.name), + ("path", self.path), + ("status", self.status), + (PROV_TYPE, ProvType.FILE_REVISION), + ] + return ProvEntity( + PLACEHOLDER, + self.identifier, + attributes, + ) -@dataclass(kw_only=True) -class ActivityMixin: - def __iter__(self): - yield self.prov_identifier - yield self.prov_start - yield self.prov_end - yield self.prov_attributes +@dataclass +class Annotation: + uid: str + name: str + body: str + start: datetime + end: datetime + annotator: User + captured_kwargs: dict[str, Any] = field(default_factory=dict) + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Annotation?{self.uid=}&{self.name=}") + + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("uid", self.uid), + ("name", self.name), + ("body", self.body), + (PROV_ATTR_STARTTIME, self.start), + (PROV_ATTR_ENDTIME, self.end), + (PROV_TYPE, ProvType.ANNOTATION), + *(("captured_" + k, v) for k, v 
in self.captured_kwargs.items()), + ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class User(ProvMixin, AgentMixin): - name: str - email: str | None = field(default=None) - gitlab_username: str | None = field(repr=False, default=None) - gitlab_id: str | None = field(repr=False, default=None) - prov_role: ProvRole = field(repr=False, default=None) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.USER) - def __post_init__(self): - self.email = self.email.lower() if self.email else None +@dataclass +class Version: + uid: str + resource: str + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"{self.resource}Version?{self.uid=}") -@dataclass(unsafe_hash=True, kw_only=True) -class File(ProvMixin, EntityMixin): - path: str - committed_in: str - prov_type: str = field(init=False, repr=False, default=ProvType.FILE) + @classmethod + def from_commit(cls, commit: Commit): + return cls(uid=commit.sha, resource=ProvType.COMMIT) + @classmethod + def from_issue(cls, issue: Issue): + return cls(uid=issue.id, resource=ProvType.ISSUE) -@dataclass(unsafe_hash=True, kw_only=True) -class FileRevision(ProvMixin, EntityMixin): - path: str - committed_in: str - change_type: str - original: File = field(repr=False, metadata=IS_RELATION) - previous: FileRevision | None = field(repr=False, default=None, metadata=IS_RELATION) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.FILE_REVISION) + @classmethod + def from_merge_request(cls, merge_request: MergeRequest): + return cls(uid=merge_request.id, resource=ProvType.MERGE_REQUEST) + def to_prov_element(self) -> ProvEntity: + attributes = [("uid", self.uid), (PROV_TYPE, f"{self.resource}Version")] + return ProvEntity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class Annotation(ProvMixin, ActivityMixin): - id: str - type: str - body: str = field(repr=False) - kwargs: 
dict[str, Any] = field(repr=False, default_factory=dict) - annotator: User = field(repr=False, metadata=IS_RELATION) - prov_start: datetime = field(repr=False) - prov_end: datetime = field(repr=False) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.ANNOTATION) - - -@dataclass(unsafe_hash=True, kw_only=True) -class Version(ProvMixin, EntityMixin): - version_id: str - prov_type: ProvType = field(repr=False) - - -@dataclass(unsafe_hash=True, kw_only=True) -class AnnotatedVersion(ProvMixin, EntityMixin): - version_id: str - annotation_id: str - prov_type: ProvType = field(repr=False) - - -@dataclass(unsafe_hash=True, kw_only=True) -class Creation(ProvMixin, ActivityMixin): - creation_id: str - prov_start: datetime = field(repr=False) - prov_end: datetime = field(repr=False) - prov_type: ProvType = field(repr=False) - - -@dataclass(unsafe_hash=True, kw_only=True) -class GitCommit(ProvMixin, ActivityMixin): - hexsha: str - message: str = field(repr=False) - title: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - committer: User = field(repr=False, metadata=IS_RELATION) - parents: list[str] = field(repr=False, metadata=IS_RELATION) - prov_start: datetime = field(repr=False) - prov_end: datetime = field(repr=False) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.GIT_COMMIT) - - -@dataclass(unsafe_hash=True, kw_only=True) -class Issue(ProvMixin, EntityMixin): - id: str - iid: str - title: str - description: str = field(repr=False) - url: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - closed_at: datetime | None = field(repr=False, default=None) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.ISSUE) + +@dataclass +class AnnotatedVersion: + uid: str + aid: str + resource: str @property - def creation(self) -> Creation: - 
return Creation( - creation_id=self.id, - prov_start=self.created_at, - prov_end=self.closed_at, - prov_type=ProvType.ISSUE_CREATION, + def identifier(self) -> QualifiedName: + return qualified_name(f"Annotated{self.resource}Version?{self.uid=}&{self.aid=}") + + @classmethod + def from_commit(cls, commit: Commit, annotation: Annotation): + return cls(uid=commit.sha, aid=annotation.uid, resource=ProvType.COMMIT) + + @classmethod + def from_issue(cls, issue: Issue, annotation: Annotation): + return cls(uid=issue.id, aid=annotation.uid, resource=ProvType.ISSUE) + + @classmethod + def from_merge_request(cls, merge_request: MergeRequest, annotation: Annotation): + return cls(uid=merge_request.id, aid=annotation.uid, resource=ProvType.MERGE_REQUEST) + + def to_prov_element(self) -> ProvEntity: + attributes = [("uid", self.uid), (PROV_TYPE, f"Annotated{self.resource}Version")] + return ProvEntity( + PLACEHOLDER, + self.identifier, + attributes, ) - @property - def first_version(self) -> Version: - return Version(version_id=self.id, prov_type=ProvType.ISSUE_VERSION) - @property - def annotated_versions(self) -> list[AnnotatedVersion]: - return [ - AnnotatedVersion( - version_id=self.id, - annotation_id=annotation.id, - prov_type=ProvType.ISSUE_VERSION_ANNOTATED, - ) - for annotation in self.annotations - ] +@dataclass +class Creation: + uid: str + resource: str + start: datetime + end: datetime + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Creation?{self.uid=}&{self.resource=}") + + @classmethod + def from_tag(cls, tag: GitTag): + return cls(uid=tag.name, resource=ProvType.TAG, start=tag.start, end=tag.end) + + @classmethod + def from_commit(cls, commit: Commit): + return cls( + uid=commit.sha, + resource=ProvType.COMMIT, + start=commit.authored_at, + end=commit.committed_at, + ) -@dataclass(unsafe_hash=True, kw_only=True) -class GithubIssue(ProvMixin, EntityMixin): - number: str # id - id: str # analogous to gitlab iid - title: str - body: 
str = field(repr=False) - url: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - closed_at: datetime | None = field(repr=False, default=None) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.ISSUE) + @classmethod + def from_issue(cls, issue: Issue): + return cls( + uid=issue.id, resource=ProvType.ISSUE, start=issue.created_at, end=issue.closed_at + ) - @property - def creation(self) -> Creation: - return Creation( - creation_id=self.number, - prov_start=self.created_at, - prov_end=self.closed_at, - prov_type=ProvType.ISSUE_CREATION, + @classmethod + def from_merge_request(cls, merge_request: MergeRequest): + return cls( + uid=merge_request.id, + resource=ProvType.MERGE_REQUEST, + start=merge_request.created_at, + end=merge_request.closed_at, ) - @property - def first_version(self) -> Version: - return Version(version_id=self.number, prov_type=ProvType.ISSUE_VERSION) + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("uid", self.uid), + (PROV_ATTR_STARTTIME, self.start), + (PROV_ATTR_ENDTIME, self.end), + (PROV_TYPE, ProvType.CREATION), + ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) + + +@dataclass +class GitCommit: + sha: str + title: str + message: str + author: User + committer: User + parents: list[str] + start: datetime # authored date + end: datetime # committed date @property - def annotated_versions(self) -> list[AnnotatedVersion]: - return [ - AnnotatedVersion( - version_id=self.number, - annotation_id=annotation.id, - prov_type=ProvType.ISSUE_VERSION_ANNOTATED, - ) - for annotation in self.annotations + def identifier(self) -> QualifiedName: + return qualified_name(f"GitCommit?{self.sha=}") + + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("sha", self.sha), + ("title", self.title), + ("message", self.message), + ("authored_at", 
self.start), + ("committed_at", self.end), + (PROV_ATTR_STARTTIME, self.start), + (PROV_ATTR_ENDTIME, self.end), + (PROV_TYPE, ProvType.GIT_COMMIT), ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class GitlabCommit(ProvMixin, EntityMixin): - hexsha: str - url: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) - authored_at: datetime = field(repr=False) - committed_at: datetime = field(repr=False) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.GITLAB_COMMIT) +@dataclass +class Issue: + id: str + iid: str + platform: str + title: str + body: str + url: str + author: User + annotations: list[Annotation] + created_at: datetime = field(repr=False) + closed_at: datetime | None = field(repr=False, default=None) + + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Issue?{self.id=}") @property def creation(self) -> Creation: - return Creation( - creation_id=self.hexsha, - prov_start=self.authored_at, - prov_end=self.committed_at, - prov_type=ProvType.GITLAB_COMMIT_CREATION, - ) + return Creation.from_issue(self) @property def first_version(self) -> Version: - return Version(version_id=self.hexsha, prov_type=ProvType.GITLAB_COMMIT_VERSION) + return Version.from_issue(self) @property def annotated_versions(self) -> list[AnnotatedVersion]: - return [ - AnnotatedVersion( - version_id=self.hexsha, - annotation_id=annotation.id, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - for annotation in self.annotations + return [AnnotatedVersion.from_issue(self, annotation) for annotation in self.annotations] + + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("id", self.id), + ("iid", self.iid), + ("platform", self.platform), + ("title", self.title), + ("body", self.body), + ("url", self.url), + ("platform", self.platform), + 
(PROV_ATTR_STARTTIME, self.created_at), + (PROV_ATTR_ENDTIME, self.closed_at), + (PROV_TYPE, ProvType.ISSUE), ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class GithubCommit(ProvMixin, EntityMixin): - hexsha: str - url: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) # comments ... - authored_at: datetime = field(repr=False) - committed_at: datetime = field(repr=False) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.GITHUB_COMMIT) +@dataclass +class Commit: + sha: str + url: str + author: User + platform: str + annotations: list[Annotation] + authored_at: datetime + committed_at: datetime + + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Commit?{self.sha=}") @property def creation(self) -> Creation: - return Creation( - creation_id=self.sha, - prov_start=self.authored_at, - prov_end=self.committed_at, - prov_type=ProvType.GITHUB_COMMIT_CREATION, - ) + return Creation.from_commit(self) @property def first_version(self) -> Version: - return Version(version_id=self.hexsha, prov_type=ProvType.GITHUB_COMMIT_VERSION) + return Version.from_commit(self) @property def annotated_versions(self) -> list[AnnotatedVersion]: - return [ - AnnotatedVersion( - version_id=self.hexsha, - annotation_id=annotation.id, - prov_type=ProvType.GITHUB_COMMIT_VERSION_ANNOTATED, - ) - for annotation in self.annotations + return [AnnotatedVersion.from_commit(self, annotation) for annotation in self.annotations] + + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("sha", self.sha), + ("url", self.url), + ("platform", self.platform), + (PROV_ATTR_STARTTIME, self.authored_at), + (PROV_ATTR_ENDTIME, self.committed_at), + (PROV_TYPE, ProvType.COMMIT), ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) 
-class MergeRequest(ProvMixin, EntityMixin): +@dataclass +class MergeRequest: id: str iid: str title: str - description: str = field(repr=False) - url: str = field(repr=False) - source_branch: str = field(repr=False) - target_branch: str = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - closed_at: datetime | None = field(repr=False, default=None) - merged_at: datetime | None = field(repr=False, default=None) - first_deployed_to_production_at: datetime | None = field(repr=False, default=None) - prov_type: ProvType = field(init=False, repr=False, default=ProvType.MERGE_REQUEST) - - @property - def creation(self) -> Creation: - return Creation( - creation_id=self.id, - prov_start=self.created_at, - prov_end=self.closed_at, - prov_type=ProvType.MERGE_REQUEST_CREATION, - ) - - @property - def first_version(self) -> Version: - return Version(version_id=self.id, prov_type=ProvType.MERGE_REQUEST_VERSION) + body: str + url: str + platform: str + source_branch: str # base for github + target_branch: str # head for github + author: User + annotations: list[Annotation] + created_at: datetime + closed_at: datetime | None = None + merged_at: datetime | None = None + first_deployed_to_production_at: datetime | None = None @property - def annotated_versions(self) -> list[AnnotatedVersion]: - return [ - AnnotatedVersion( - version_id=self.id, - annotation_id=annotation.id, - prov_type=ProvType.MERGE_REQUEST_VERSION_ANNOTATED, - ) - for annotation in self.annotations - ] - - -@dataclass(unsafe_hash=True, kw_only=True) -class GithubPullRequest(ProvMixin, EntityMixin): - number: str # id - id: str # iid - title: str - body: str = field(repr=False) - url: str = field(repr=False) - head: str = field(repr=False) # source_branch - base: str = field(repr=False) # target_branch - author: User = field(repr=False, metadata=IS_RELATION) - 
annotations: list[Annotation] = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - closed_at: datetime | None = field(repr=False, default=None) - merged_at: datetime | None = field(repr=False, default=None) # TODO: is this field necessary? - prov_type: ProvType = field(init=False, repr=False, default=ProvType.PULL_REQUEST) + def identifier(self) -> QualifiedName: + return qualified_name(f"MergeRequest?{self.id=}") @property def creation(self) -> Creation: - return Creation( - creation_id=self.number, - prov_start=self.created_at, - prov_end=self.closed_at, - prov_type=ProvType.PULL_REQUEST_CREATION, - ) + return Creation.from_merge_request(self) @property def first_version(self) -> Version: - return Version(version_id=self.number, prov_type=ProvType.PULL_REQUEST_VERSION) + return Version.from_merge_request(self) @property def annotated_versions(self) -> list[AnnotatedVersion]: return [ - AnnotatedVersion( - version_id=self.number, - annotation_id=annotation.id, - prov_type=ProvType.PULL_REQUEST_VERSION_ANNOTATED, - ) + AnnotatedVersion.from_merge_request(self, annotation) for annotation in self.annotations ] + def to_prov_element(self) -> ProvActivity: + attributes = [ + ("id", self.id), + ("iid", self.iid), + ("title", self.title), + ("body", self.body), + ("url", self.url), + ("platform", self.platform), + ("source_branch", self.source_branch), + ("target_branch", self.target_branch), + (PROV_ATTR_STARTTIME, self.created_at), + (PROV_ATTR_ENDTIME, self.closed_at), + (PROV_TYPE, ProvType.MERGE_REQUEST), + ] + return ProvActivity(PLACEHOLDER, self.identifier, attributes) + -@dataclass(unsafe_hash=True, kw_only=True) -class Tag(ProvMixin, EntityMixin): +@dataclass +class GitTag: name: str - hexsha: str - message: str | None = field(repr=False) - author: User = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - prov_type: list[ProvType] = field( - init=False, - repr=False, - default_factory=lambda: 
[ProvType.TAG, ProvType.COLLECTION], - ) + sha: str + message: str | None + author: User + created_at: datetime + + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"GitTag?{self.name=}") @property def creation(self) -> Creation: - return Creation( - creation_id=self.name, - prov_start=self.created_at, - prov_end=self.created_at, - prov_type=ProvType.TAG_CREATION, - ) + return Creation.from_tag(self) + + def to_prov_element(self) -> ProvEntity: + attributes = [ + ("name", self.name), + ("sha", self.sha), + ("message", self.message), + (PROV_ATTR_STARTTIME, self.created_at), + (PROV_ATTR_ENDTIME, self.created_at), + (PROV_TYPE, ProvType.TAG), + (PROV_TYPE, ProvType.COLLECTION), + ] + return ProvEntity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class Asset(ProvMixin, EntityMixin): +@dataclass +class Asset: url: str format: str - prov_type: ProvType = field(init=False, repr=False, default=ProvType.ASSET) + + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Asset?{self.url=}") + + def to_prov_element(self) -> ProvEntity: + attributes = [ + ("url", self.url), + ("format", self.format), + (PROV_TYPE, ProvType.ASSET), + ] + return ProvEntity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class Evidence(ProvMixin, EntityMixin): - hexsha: str +@dataclass +class Evidence: + sha: str url: str collected_at: datetime - prov_type: ProvType = field(init=False, repr=False, default=ProvType.EVIDENCE) + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Evidence?{self.sha=}") + + def to_prov_element(self) -> ProvEntity: + attributes = [ + ("sha", self.sha), + ("url", self.url), + ("collected_at", self.collected_at), + (PROV_TYPE, ProvType.EVIDENCE), + ] + return ProvEntity(PLACEHOLDER, self.identifier, attributes) -@dataclass(unsafe_hash=True, kw_only=True) -class Release(ProvMixin, EntityMixin): + +@dataclass +class 
Release: name: str - description: str = field(repr=False) - tag_name: str = field(repr=False) - author: User | None = field(repr=False, metadata=IS_RELATION) - assets: list[Asset] = field(repr=False, metadata=IS_RELATION) - evidences: list[Evidence] = field(repr=False, metadata=IS_RELATION) - created_at: datetime = field(repr=False) - released_at: datetime = field(repr=False) - prov_type: list[ProvType] = field( - init=False, - repr=False, - default_factory=lambda: [ProvType.RELEASE, ProvType.COLLECTION], - ) + body: str + tag_name: str + platform: str + author: User | None + assets: list[Asset] + evidences: list[Evidence] + created_at: datetime + released_at: datetime + + @property + def identifier(self) -> QualifiedName: + return qualified_name(f"Release?{self.name=}") @property def creation(self) -> Creation: - return Creation( - creation_id=self.name, - prov_start=self.created_at, - prov_end=self.released_at, - prov_type=ProvType.RELEASE_CREATION, - ) + return Creation.from_release(self) + + def to_prov_element(self) -> ProvEntity: + attributes = [ + ("name", self.name), + ("body", self.body), + ("tag_name", self.tag_name), + ("platform", self.platform), + ("created_at", self.created_at), + ("released_at", self.released_at), + (PROV_TYPE, ProvType.RELEASE), + (PROV_TYPE, ProvType.COLLECTION), + ] + return ProvEntity(PLACEHOLDER, self.identifier, attributes) From 2be0f6c8e21cc9774c37ea4a91155314077423c8 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:46:01 +0100 Subject: [PATCH 17/81] Update constants --- gitlab2prov/domain/constants.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/gitlab2prov/domain/constants.py b/gitlab2prov/domain/constants.py index ea416e7..13cf12d 100644 --- a/gitlab2prov/domain/constants.py +++ b/gitlab2prov/domain/constants.py @@ -29,15 +29,14 @@ class ChangeType: class ProvRole: GIT_COMMIT = "GitCommit" + COMMIT = "Commit" COMMITTER = "Committer" AUTHOR = "Author" - 
AUTHOR_GITLAB_COMMIT = "GitlabCommitAuthor" - AUTHOR_GITHUB_COMMIT = "GithubCommitAuthor" - AUTHOR_ISSUE = "IssueAuthor" - AUTHOR_MERGE_REQUEST = "MergeRequestAuthor" - AUTHOR_PULL_REQUEST = "PullRequestAuthor" - AUTHOR_RELEASE = "ReleaseAuthor" - AUTHOR_TAG = "TagAuthor" + COMMIT_AUTHOR = "CommitAuthor" + ISSUE_AUTHOR = "IssueAuthor" + MERGE_REQUEST_AUTHOR = "MergeRequestAuthor" + RELEASE_AUTHOR = "ReleaseAuthor" + TAG_AUTHOR = "TagAuthor" ANNOTATOR = "Annotator" FILE = "File" FILE_REVISION_TO_BE_MODIFIED = "FileRevisionToBeModified" @@ -45,12 +44,20 @@ class ProvRole: FILE_REVISION_AT_POINT_OF_ADDITION = "FileRevisionAtPointOfAddition" FILE_REVISION_AT_POINT_OF_DELETION = "FileRevisionAtPointOfDeletion" RESOURCE = "Resource" + FIRST_RESOURCE_VERSION = "FirstResourceVersion" RESOURCE_VERSION_AT_POINT_OF_CREATION = "ResourceVersionAtPointOfCreation" RESOURCE_VERSION_TO_BE_ANNOTATED = "ResourceVersionToBeAnnotated" RESOURCE_VERSION_AFTER_ANNOTATION = "ResourceVersionAfterAnnotation" + PRE_ANNOTATION_VERSION = "PreAnnotationVersion" + POST_ANNOTATION_VERSION = "PostAnnotationVersion" RELEASE = "Release" TAG = "Tag" - GitCommit = "GitCommit" + GITCOMMIT = "GitCommit" + ADDED_REVISION = "AddedRevision" + DELETED_REVISION = "DeletedRevision" + MODIFIED_REVISION = "ModifiedRevision" + PREVIOUS_REVISION = "PreviousRevision" + class ProvType: @@ -58,7 +65,7 @@ class ProvType: GIT_COMMIT = "GitCommit" FILE = "File" FILE_REVISION = "FileRevision" - GITLAB_COMMIT = "GitlabCommit" + COMMIT = "Commit" GITLAB_COMMIT_VERSION = "GitlabCommitVersion" GITLAB_COMMIT_VERSION_ANNOTATED = "AnnotatedGitlabCommitVersion" GITLAB_COMMIT_CREATION = "GitlabCommitCreation" @@ -78,6 +85,7 @@ class ProvType: PULL_REQUEST_VERSION = "PullRequestVersion" PULL_REQUEST_VERSION_ANNOTAED = "AnnotatedPullRequestVersion" PULL_REQUEST_CREATION = "PullRequestCreation" + CREATION = "Creation" ANNOTATION = "Annotation" TAG = "Tag" TAG_CREATION = "TagCreation" From ed20caa9b2d79ad37e3c46da7538688e0d32c726 Mon 
Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:46:57 +0100 Subject: [PATCH 18/81] Annotation parsing should use the new domain objects --- .../adapters/fetch/annotations/parse.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/gitlab2prov/adapters/fetch/annotations/parse.py b/gitlab2prov/adapters/fetch/annotations/parse.py index c423222..383625d 100644 --- a/gitlab2prov/adapters/fetch/annotations/parse.py +++ b/gitlab2prov/adapters/fetch/annotations/parse.py @@ -74,13 +74,13 @@ def parse_system_note(note: Note) -> Annotation: ) annotation_type, kwargs = classify_system_note(note.body) return Annotation( - id=note.id, - type=annotation_type, + uid=note.id, + name=annotation_type, body=note.body, - kwargs=kwargs, + start=note.created_at, + end=note.created_at, + captured_kwargs=kwargs, annotator=annotator, - prov_start=note.created_at, - prov_end=note.created_at, ) @@ -93,12 +93,12 @@ def parse_comment(comment: Comment) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - id=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", - type="add_comment", + uid=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", + name="add_comment", body=comment.note, + start=comment.created_at, + end=comment.created_at, annotator=annotator, - prov_start=comment.created_at, - prov_end=comment.created_at, ) @@ -111,12 +111,12 @@ def parse_note(note: Note) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - id=note.id, - type="add_note", + uid=note.id, + name="add_note", body=note.body, annotator=annotator, - prov_start=note.created_at, - prov_end=note.created_at, + start=note.created_at, + end=note.created_at, ) @@ -129,12 +129,12 @@ def parse_award(award: AwardEmoji) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - id=award.id, - type="award_emoji", + uid=award.id, + name="award_emoji", body=award.name, annotator=annotator, - prov_start=award.created_at, - 
prov_end=award.created_at, + start=award.created_at, + end=award.created_at, ) @@ -147,12 +147,12 @@ def parse_label(label: Label) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - id=label.id, - type=f"{label.action}_label", + uid=label.id, + name=f"{label.action}_label", body=label.action, annotator=annotator, - prov_start=label.created_at, - prov_end=label.created_at, + start=label.created_at, + end=label.created_at, ) @@ -182,4 +182,4 @@ def parse_annotations( for parseable in parseables: if parser := choose_parser(parseable): annotations.append(parser(parseable)) - return sorted(annotations, key=operator.attrgetter("prov_start")) + return sorted(annotations, key=operator.attrgetter("start")) From 300c8bf74e8eae61a5658bce19c287f6f2f4e848 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:49:19 +0100 Subject: [PATCH 19/81] Add validation exception handling --- gitlab2prov/config/parser.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/gitlab2prov/config/parser.py b/gitlab2prov/config/parser.py index 073de78..7899cc6 100644 --- a/gitlab2prov/config/parser.py +++ b/gitlab2prov/config/parser.py @@ -2,7 +2,9 @@ from typing import Any import jsonschema -from ruamel.yaml import YAML +import jsonschema.exceptions + +from ruamel.yaml import YAML, constructor from gitlab2prov.root import get_package_root @@ -21,8 +23,17 @@ def get_schema() -> dict[str, Any]: class ConfigParser: @staticmethod - def validate(filepath: str) -> None: - jsonschema.validate(read_file(filepath), get_schema()) + def validate(filepath: str) -> tuple[bool, str]: + try: + validator = jsonschema.Draft7Validator(get_schema()) + validator.validate(read_file(filepath)) + except jsonschema.exceptions.ValidationError as err: + return False, err.message + except jsonschema.exceptions.SchemaError as err: + return False, err.message + except constructor.DuplicateKeyError as err: + return False, err.problem + return True, "Everything is 
fine!" def parse(self, filepath: str) -> list[str]: content = read_file(filepath) From 60fb4d90aa5e59ed87d39f328bddf50a0b2cc9a5 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:50:16 +0100 Subject: [PATCH 20/81] Add 'additionalProperties: false' to command properties to allow only schema defined keys --- gitlab2prov/config/schema.json | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gitlab2prov/config/schema.json b/gitlab2prov/config/schema.json index ff77cb9..2262428 100644 --- a/gitlab2prov/config/schema.json +++ b/gitlab2prov/config/schema.json @@ -19,6 +19,7 @@ "type": "string" } }, + "additionalProperties": false, "required": [ "url", "token" @@ -39,6 +40,7 @@ } } }, + "additionalProperties": false, "required": [ "input" ] @@ -69,6 +71,7 @@ } } }, + "additionalProperties": false, "required": [ "output", "format" @@ -106,9 +109,10 @@ "csv" ] } - } + }, + "additionalProperties": false } } } ] -} +} \ No newline at end of file From a9b1b9e1fd285f9f560bdd0a9901e9640133918e Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:50:41 +0100 Subject: [PATCH 21/81] Update handlers --- gitlab2prov/service_layer/handlers.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/gitlab2prov/service_layer/handlers.py b/gitlab2prov/service_layer/handlers.py index 88259ea..e15bf48 100644 --- a/gitlab2prov/service_layer/handlers.py +++ b/gitlab2prov/service_layer/handlers.py @@ -18,11 +18,12 @@ def fetch_git(cmd: commands.Fetch, uow, git_fetcher) -> None: uow.commit() -def fetch_gitlab(cmd: commands.Fetch, uow, gitlab_fetcher, github_fetcher) -> None: - fetcher = gitlab_fetcher() if "gitlab" in cmd.url else github_fetcher() - fetcher.do_login(cmd.url, cmd.token) +def fetch_githosted(cmd: commands.Fetch, uow, fetcher_factory) -> None: + fetcher = fetcher_factory.factory(cmd.url) + log.info("choose fetcher {fetcher=} for {cmd.url=}") + fetcher = fetcher(cmd.token, cmd.url) with uow: - for resource 
in fetcher.fetch_gitlab(): + for resource in fetcher.fetch_all(): log.info(f"add {resource=}") uow.resources.add(resource) uow.commit() @@ -35,15 +36,18 @@ def reset(cmd: commands.Reset, uow): def serialize(cmd: commands.Serialize, uow) -> ProvDocument: log.info(f"serialize graph consisting of {model.MODELS=}") - graph = operations.combine(prov_model(uow.resources) for prov_model in model.MODELS) - graph = operations.dedupe(graph) - return graph + document = ProvDocument() + for prov_model in model.MODELS: + provenance = prov_model(uow.resources) + document = operations.combine(document, provenance) + document = operations.dedupe(document) + return document HANDLERS = { commands.Fetch: [ fetch_git, - fetch_gitlab, + fetch_githosted, ], commands.Reset: [reset], commands.Serialize: [serialize], From 277b7eb8786be9f4306e6766f32e91ad4e84e256 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:51:05 +0100 Subject: [PATCH 22/81] Add new model implementation --- gitlab2prov/prov/model.py | 958 +++++++++++++++++++++++--------------- 1 file changed, 574 insertions(+), 384 deletions(-) diff --git a/gitlab2prov/prov/model.py b/gitlab2prov/prov/model.py index b24aad1..d2334ce 100644 --- a/gitlab2prov/prov/model.py +++ b/gitlab2prov/prov/model.py @@ -1,404 +1,594 @@ -from typing import Optional, Union - -from prov.model import ProvDocument, PROV_ROLE +from typing import Optional, Union, Type, Iterable, Callable, Any +from dataclasses import dataclass, field +from operator import attrgetter + +from prov.model import ( + ProvDocument, + ProvDerivation, + PROV_ROLE, + PROV_ATTR_STARTTIME, + ProvInvalidation, + ProvMembership, + ProvElement, + ProvUsage, + ProvAssociation, + ProvAttribution, + ProvGeneration, + ProvRelation, + ProvSpecialization, + ProvCommunication, + ProvRelation, + ProvRecord, +) +from prov.identifier import QualifiedName, Namespace +from functools import partial -from gitlab2prov.prov.operations import graph_factory from 
gitlab2prov.adapters.repository import AbstractRepository -from gitlab2prov.domain.constants import ChangeType, ProvRole +from gitlab2prov.domain.constants import ProvRole from gitlab2prov.domain.objects import ( FileRevision, GitCommit, - GitlabCommit, - GithubCommit, + Commit, Issue, MergeRequest, - GithubPullRequest, Release, - Tag, + GitTag, + Annotation, + Creation, + AnnotatedVersion, ) -Resource = Union[GitlabCommit, Issue, MergeRequest] - - -def git_commit_model(resources: AbstractRepository, graph: ProvDocument = None): - """Commit model implementation.""" - if graph is None: - graph = graph_factory() - for commit in resources.list_all(GitCommit): - file_revisions = resources.list_all(FileRevision, committed_in=commit.hexsha) - parents = [resources.get(GitCommit, hexsha=hexsha) for hexsha in commit.parents] - parents = [parent for parent in parents if parent is not None] - for rev in file_revisions: - model = choose_rev_model(rev) - if model is None: - continue - graph.update(model(commit, parents, rev)) - return graph - - -def choose_rev_model(rev: FileRevision): - """Add the file change models based on the change type of each file version.""" - if rev.change_type == ChangeType.ADDED: - return addition - if ( - rev.change_type == ChangeType.MODIFIED - or rev.change_type == ChangeType.RENAMED - or rev.change_type == ChangeType.COPIED - or rev.change_type == ChangeType.CHANGED +AUTHOR_ROLE_MAP = { + Commit: ProvRole.COMMIT_AUTHOR, + Issue: ProvRole.ISSUE_AUTHOR, + MergeRequest: ProvRole.MERGE_REQUEST_AUTHOR, +} + + +HostedResource = Commit | Issue | MergeRequest +Query = Callable[[AbstractRepository], Iterable[HostedResource]] +DEFAULT_NAMESPACE = Namespace("ex", "example.org") + + +def file_status_query(repository: AbstractRepository, status: str): + for revision in repository.list_all(FileRevision, status=status): + commit = repository.get(GitCommit, sha=revision.commit) + for parent in [repository.get(GitCommit, sha=sha) for sha in commit.parents]: + 
yield commit, parent, revision, revision.previous if status == "modified" else None + + +def hosted_resource_query(repository: AbstractRepository, resource_type: Type[HostedResource]): + for resource in repository.list_all(resource_type): + if resource_type == Commit: + yield (resource, repository.get(GitCommit, sha=resource.sha)) + yield (resource, None) + + +FileAdditionQuery = partial(file_status_query, status="added") +FileDeletionQuery = partial(file_status_query, status="deleted") +FileModificationQuery = partial(file_status_query, status="modified") +HostedCommitQuery = partial(hosted_resource_query, resource_type=Commit) +HostedIssueQuery = partial(hosted_resource_query, resource_type=Issue) +HostedMergeQuery = partial(hosted_resource_query, resource_type=MergeRequest) + + +@dataclass +class ProvenanceContext: + document: ProvDocument + namespace: Optional[str] = None + + def add_element(self, dataclass_instance) -> ProvRecord: + # Convert the dataclass instance to a ProvElement + element = self.convert_to_prov_element(dataclass_instance) + # Add the namespace to the element if it is provided + if self.namespace: + element.add_namespace(self.namespace) + # Return the newly added element + return self.document.add_record(element) + + def convert_to_prov_element(self, dataclass_instance) -> ProvElement: + # Convert the dataclass instance to a ProvElement + element = dataclass_instance.to_prov_element() + # Add the element to the ProvDocument and return it + return self.document.new_record(element._prov_type, element.identifier, element.attributes) + + def add_relation( + self, + source_dataclass_instance, + target_dataclass_instance, + relationship_type: Type[ProvRelation], + attributes: dict[str, Any] = None, + ) -> None: + # Initialize attributes if they are not provided + if not attributes: + attributes = dict() + # Make sure that both source and target are part of the document + source = self.add_element(source_dataclass_instance) + target = 
self.add_element(target_dataclass_instance) + # Create a relationship between the source and target + relationship = self.document.new_record( + relationship_type._prov_type, + QualifiedName(DEFAULT_NAMESPACE, f"relation:{source.identifier}:{target.identifier}"), + { + relationship_type.FORMAL_ATTRIBUTES[0]: source, + relationship_type.FORMAL_ATTRIBUTES[1]: target, + }, + ) + # Add the remaining attributes to the relationship + relationship.add_attributes(attributes) + # Add the relationship to the ProvDocument + self.document.add_record(relationship) + + def get_document(self): + return self.document + + +@dataclass +class FileAdditionModel: + commit: GitCommit + parent: GitCommit + revisions: FileRevision + ctx: ProvenanceContext = field(init=False) + + def __post_init__(self): + self.ctx = ProvenanceContext(ProvDocument()) + + def build_provenance_model(self) -> ProvDocument: + # Add the elements to the context + self.ctx.add_element(self.commit) + self.ctx.add_element(self.commit.author) + self.ctx.add_element(self.commit.committer) + self.ctx.add_element(self.revision) + self.ctx.add_element(self.revision.file) + # Check if parent exists + if self.parent: + # Add the parent to the context + self.ctx.add_element(self.parent) + # Add the communication relation (wasInformedBy) between the parent and the commit + self.ctx.add_relation(self.commit, self.parent, ProvCommunication, {}) + # Add the relations to the context + self.ctx.add_relation( + self.commit, + self.commit.author, + ProvAssociation, + {PROV_ROLE: ProvRole.AUTHOR}, + ) + self.ctx.add_relation( + self.commit, + self.commit.committer, + ProvAssociation, + {PROV_ROLE: ProvRole.COMMITTER}, + ) + self.ctx.add_relation( + self.revision, + self.commit, + ProvGeneration, + { + PROV_ATTR_STARTTIME: self.commit.start, + PROV_ROLE: ProvRole.FILE, + }, + ) + self.ctx.add_relation( + self.revision.file, + self.commit, + ProvGeneration, + { + PROV_ATTR_STARTTIME: self.commit.start, + PROV_ROLE: 
ProvRole.ADDED_REVISION, + }, + ) + self.ctx.add_relation(self.revision.file, self.commit.author, ProvAttribution) + self.ctx.add_relation(self.revision, self.revision.file, ProvSpecialization) + # Return the document + return self.ctx.get_document() + + +@dataclass +class FileDeletionModel: + commit: GitCommit + parent: GitCommit + revision: FileRevision + ctx: ProvenanceContext = field(init=False) + + def __post_init__(self): + # Initialize the context + self.ctx = ProvenanceContext(ProvDocument()) + + def build_provenance_model(self) -> ProvDocument: + # Add the elements to the context + self.ctx.add_element(self.commit) + self.ctx.add_element(self.revision) + self.ctx.add_element(self.revision.file) + self.ctx.add_element(self.commit.author) + self.ctx.add_element(self.commit.committer) + # Check if parent exists + if self.parent: + # Add the parent to the context + self.ctx.add_element(self.parent) + # Add the communication relation (wasInformedBy) between the parent and the commit + self.ctx.add_relation(self.commit, self.parent, ProvCommunication) + # Add the relations to the context + self.ctx.add_relation( + self.commit, self.comitter, ProvAssociation, {PROV_ROLE: ProvRole.COMMITTER} + ) + self.ctx.add_relation( + self.commit, self.author, ProvAssociation, {PROV_ROLE: ProvRole.AUTHOR} + ) + self.ctx.add_relation(self.revision, self.revision.file, ProvSpecialization) + self.ctx.add_relation( + self.revision, + self.commit, + ProvInvalidation, + {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.DELETED_REVISION}, + ) + # Return the document + return self.ctx.get_document() + + +@dataclass +class FileModificationModel: + commit: GitCommit + parent: GitCommit + revision: FileRevision + previous: FileRevision + ctx: ProvenanceContext = field(init=False) + + def __post_init__(self): + # Initialize the context + self.ctx = ProvenanceContext(ProvDocument()) + + def build_provenance_model(self) -> ProvDocument: + # Add the elements to the context + 
self.ctx.add_element(self.commit) + self.ctx.add_element(self.revision) + self.ctx.add_element(self.revision.file) + self.ctx.add_element(self.previous) + self.ctx.add_element(self.commit.author) + self.ctx.add_element(self.commit.committer) + # Check if parent exists + if self.parent: + # Add the parent to the context + self.ctx.add_element(self.parent) + # Add the communication relation (wasInformedBy) between the parent and the commit + self.ctx.add_relation(self.commit, self.parent, ProvCommunication) + # Add the relations to the context + self.ctx.add_relation( + self.commit, self.commit.author, ProvAssociation, {PROV_ROLE: ProvRole.AUTHOR} + ) + self.ctx.add_relation( + self.commit, self.commit.committer, ProvAssociation, {PROV_ROLE: ProvRole.COMMITTER} + ) + self.ctx.add_relation(self.revision, self.revision.file, ProvSpecialization) + self.ctx.add_relation( + self.revision, + self.commit, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.MODIFIED_REVISION}, + ) + self.ctx.add_relation(self.revision, self.commit.author, ProvAttribution) + self.ctx.add_relation( + self.revision, self.previous, ProvDerivation + ) # TODO: has to be wasRevisionOf record, add asserted type 'Revison' + self.ctx.add_relation( + self.commit, + self.previous, + ProvUsage, + {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.PREVIOUS_REVISION}, + ) + # Return the document + return self.ctx.get_document() + + +@dataclass +class HostedResourceModel: + """Model for a hosted resource (e.g., commit, issue, merge request).""" + + resource: Union[Commit, Issue, MergeRequest] + commit: Optional[GitCommit] = None + ctx: ProvenanceContext = field(init=False) + + def __post_init__(self): + # Initialize the context + self.ctx = ProvenanceContext(ProvDocument()) + + def build_provenance_model(self): + # Choose the creation part based on the type of resource + if isinstance(self.resource, Commit) and self.commit: + 
self._add_creation_part_for_hosted_commits() + else: + self._add_creation_part() + # Set the previous annotation and version to the creation / original version + previous_annotation = self.resource.creation + previous_version = self.resource.first_version + # For each annotation and version, add the annotation part, sort by time ascending + for current_annotation, current_version in zip( + sorted(self.resource.annotations, key=attrgetter("start")), + sorted(self.resource.annotated_versions, key=attrgetter("start")), + ): + # Add the annotation chain link + self._add_annotation_part( + current_annotation, + previous_annotation, + current_version, + previous_version, + ) + # Update the previous annotation and version + previous_annotation = current_annotation + previous_version = current_version + + return self.ctx.get_document() + + def _add_creation_part_for_hosted_commits(self): + # Add the elements to the context + self.ctx.add_element(self.resource) + self.ctx.add_element(self.resource.creation) + self.ctx.add_element(self.resource.first_version) + self.ctx.add_element(self.resource.author) + self.ctx.add_element(self.commit) + self.ctx.add_element(self.commit.committer) + # Add the relations to the context + self.ctx.add_relation( + self.resource.creation, + self.resource.author, + ProvAssociation, + {PROV_ROLE: ProvRole.COMMIT_AUTHOR}, + ) + self.ctx.add_relation(self.resource, self.resource.author, ProvAttribution) + self.ctx.add_relation(self.resource.first_version, self.resource, ProvSpecialization) + self.ctx.add_relation(self.resource.first_version, self.resource.author, ProvAttribution) + self.ctx.add_relation( + self.resource, + self.resource.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.resource.creation.start, PROV_ROLE: ProvRole.RESOURCE}, + ) + self.ctx.add_relation( + self.resource.first_version, + self.resource.creation, + ProvGeneration, + { + PROV_ATTR_STARTTIME: self.resource.creation.start, + PROV_ROLE: 
ProvRole.FIRST_RESOURCE_VERSION, + }, + ) + self.ctx.add_relation(self.resource.creation, self.commit, ProvCommunication) + self.ctx.add_relation( + self.commit, self.commit.committer, ProvAssociation, {PROV_ROLE: ProvRole.COMMIT_AUTHOR} + ) + + def _add_creation_part(self): + self.ctx.add_element(self.resource) + self.ctx.add_element(self.resource.creation) + self.ctx.add_element(self.resource.first_version) + self.ctx.add_element(self.resource.author) + + self.ctx.add_relation(self.resource, self.resource.author, ProvAttribution) + self.ctx.add_relation(self.resource.first_version, self.resource, ProvSpecialization) + self.ctx.add_relation(self.resource.first_version, self.resource.author, ProvAttribution) + self.ctx.add_relation( + self.resource.creation, + self.resource.author, + ProvAssociation, + {PROV_ROLE: AUTHOR_ROLE_MAP[type(self.resource)]}, + ) + self.ctx.add_relation( + self.resource, + self.resource.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.resource.creation.start, PROV_ROLE: ProvRole.RESOURCE}, + ) + self.ctx.add_relation( + self.resource.first_version, + self.resource.creation, + ProvGeneration, + { + PROV_ATTR_STARTTIME: self.resource.creation.start, + PROV_ROLE: ProvRole.FIRST_RESOURCE_VERSION, + }, + ) + + def _add_annotation_part( + self, + current_annotation: Annotation, + previous_annotation: Union[Annotation, Creation], + current_version: AnnotatedVersion, + previous_version: AnnotatedVersion, ): - return modification - if rev.change_type == ChangeType.DELETED: - return deletion - return None - - -def addition( - commit: GitCommit, - parents: list[GitCommit], - rev: FileRevision, - graph: ProvDocument = None, -): - """Add model for the addition of a new file in a commit.""" - if graph is None: - graph = graph_factory() - c = graph.activity(*commit) - at = graph.agent(*commit.author) - ct = graph.agent(*commit.committer) - - c.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) 
- c.wasAssociatedWith( - ct, plan=None, attributes=[(PROV_ROLE, list(ct.get_attribute(PROV_ROLE))[0])] - ) - - for parent in parents: - graph.activity(*commit).wasInformedBy(graph.activity(*parent)) - - f = graph.entity(*rev.original) - f.wasAttributedTo(at) - f.wasGeneratedBy(c, time=c.get_startTime(), attributes=[(PROV_ROLE, ProvRole.FILE)]) - - rev = graph.entity(*rev) - rev.wasAttributedTo(at) - rev.specializationOf(f) - rev.wasGeneratedBy( - c, - time=c.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.FILE_REVISION_AT_POINT_OF_ADDITION)], - ) - return graph - - -def modification( - commit: GitCommit, - parents: list[GitCommit], - fv: FileRevision, - graph: ProvDocument = None, -): - if graph is None: - graph = graph_factory() - c = graph.activity(*commit) - at = graph.agent(*commit.author) - ct = graph.agent(*commit.committer) - - c.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) - c.wasAssociatedWith( - ct, plan=None, attributes=[(PROV_ROLE, list(ct.get_attribute(PROV_ROLE))[0])] - ) - - for parent in parents: - graph.activity(*commit).wasInformedBy(graph.activity(*parent)) - - f = graph.entity(*fv.original) - rev = graph.entity(*fv) - rev.wasAttributedTo(at) - rev.specializationOf(f) - rev.wasGeneratedBy( - c, - time=c.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.FILE_REVISION_AFTER_MODIFICATION)], - ) - - # skip previous revisions if none exist - if fv.previous is None: - return graph - - prev = graph.entity(*fv.previous) - prev.specializationOf(f) - graph.wasRevisionOf(rev, prev) # NOTE: rev.wasRevisionOf(prev) is not impl in prov pkg - c.used( - prev, - c.get_startTime(), - [(PROV_ROLE, ProvRole.FILE_REVISION_TO_BE_MODIFIED)], - ) - return graph - - -def deletion( - commit: GitCommit, - parents: list[GitCommit], - fv: FileRevision, - graph: ProvDocument = None, -): - if graph is None: - graph = graph_factory() - c = graph.activity(*commit) - at = graph.agent(*commit.author) - ct = 
graph.agent(*commit.committer) - - c.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) - c.wasAssociatedWith( - ct, plan=None, attributes=[(PROV_ROLE, list(ct.get_attribute(PROV_ROLE))[0])] - ) - - for parent in parents: - graph.activity(*commit).wasInformedBy(graph.activity(*parent)) - - f = graph.entity(*fv.original) - rev = graph.entity(*fv) - rev.specializationOf(f) - rev.wasInvalidatedBy( - c, - c.get_startTime(), - [(PROV_ROLE, ProvRole.FILE_REVISION_AT_POINT_OF_DELETION)], - ) - return graph - - -def gitlab_commit_model(resources, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - - github_commits = resources.list_all(GitlabCommit) - gitlab_commits = resources.list_all(GithubCommit) - - for commit in {*github_commits, *gitlab_commits}: - git_commit = resources.get(GitCommit, hexsha=commit.hexsha) - - creation = commit_creation(commit, git_commit) - annotats = annotation_chain(commit) - - graph.update(creation) - graph.update(annotats) - - return graph - - -def gitlab_issue_model(resources, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - for issue in resources.list_all(Issue): - graph.update(resource_creation(issue)) - graph.update(annotation_chain(issue)) - return graph - - -def gitlab_merge_request_model(resources, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - - merge_requests = resources.list_all(MergeRequest) - pull_requests = resources.list_all(GithubPullRequest) - - for merge_request in {*merge_requests, *pull_requests}: - creation = resource_creation(merge_request) - annotats = annotation_chain(merge_request) - - graph.update(creation) - graph.update(annotats) - return graph - - -def commit_creation( - gitlab_commit: GitlabCommit, - git_commit: Optional[GitCommit], - graph: ProvDocument = None, -): - if graph is None: - graph = graph_factory() - - resource = graph.entity(*gitlab_commit) - creation = 
graph.activity(*gitlab_commit.creation) - first_version = graph.entity(*gitlab_commit.first_version) - author = graph.agent(*gitlab_commit.author) - - resource.wasAttributedTo(author) - creation.wasAssociatedWith( - author, plan=None, attributes=[(PROV_ROLE, ProvRole.AUTHOR_GITLAB_COMMIT)] - ) - resource.wasGeneratedBy( - creation, - time=creation.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.RESOURCE)], - ) - first_version.wasGeneratedBy( - creation, - time=creation.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.RESOURCE_VERSION_AT_POINT_OF_CREATION)], - ) - first_version.specializationOf(resource) - first_version.wasAttributedTo(author) - - if git_commit is None: - return graph - - commit = graph.activity(*git_commit) - committer = graph.agent(*git_commit.committer) - commit.wasAssociatedWith(committer, plan=None, attributes=[(PROV_ROLE, ProvRole.COMMITTER)]) - creation.wasInformedBy(commit) - - return graph - - -def resource_creation(resource: Resource, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - r = graph.entity(*resource) - c = graph.activity(*resource.creation) - rv = graph.entity(*resource.first_version) - at = graph.agent(*resource.author) - - c.wasAssociatedWith( - at, - plan=None, - attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])], - ) - - r.wasAttributedTo(at) - rv.wasAttributedTo(at) - rv.specializationOf(r) - r.wasGeneratedBy( - c, - time=c.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.RESOURCE)], - ) - rv.wasGeneratedBy( - c, - time=c.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.RESOURCE_VERSION_AT_POINT_OF_CREATION)], - ) - return graph - - -def annotation_chain(resource, graph=None): - if graph is None: - graph = graph_factory() - r = graph.entity(*resource) - c = graph.activity(*resource.creation) - fv = graph.entity(*resource.first_version) - - prev_annot = c - prev_annot_ver = fv - - for annotation, annotated_version in zip(resource.annotations, resource.annotated_versions): - 
annot = graph.activity(*annotation) - annot_ver = graph.entity(*annotated_version) - annotator = graph.agent(*annotation.annotator) - - annot.wasInformedBy(prev_annot) - annot_ver.wasDerivedFrom(prev_annot_ver) - annot_ver.wasAttributedTo(annotator) - annot_ver.specializationOf(r) - - annot.wasAssociatedWith( - annotator, - plan=None, - attributes=[(PROV_ROLE, list(annotator.get_attribute(PROV_ROLE))[0])], + # Add the elements to the context + self.ctx.add_element(self.resource) + self.ctx.add_element(self.resource.creation) + self.ctx.add_element(current_annotation) + self.ctx.add_element(current_annotation.annotator) + self.ctx.add_element(current_version) + self.ctx.add_element(previous_annotation) + self.ctx.add_element(previous_version) + # Add the relations to the context + self.ctx.add_relation(current_annotation, previous_annotation, ProvCommunication) + self.ctx.add_relation(current_version, previous_version, ProvDerivation) + self.ctx.add_relation(current_version, current_annotation.annotator, ProvAttribution) + self.ctx.add_relation( + current_annotation, + current_annotation.annotator, + ProvAssociation, + {PROV_ROLE: ProvRole.ANNOTATOR}, + ) + self.ctx.add_relation( + current_annotation, + previous_version, + ProvUsage, + { + PROV_ATTR_STARTTIME: current_annotation.start, + PROV_ROLE: ProvRole.PRE_ANNOTATION_VERSION, + }, ) + self.ctx.add_relation( + current_version, + current_annotation, + ProvGeneration, + { + PROV_ATTR_STARTTIME: current_annotation.start, + PROV_ROLE: ProvRole.POST_ANNOTATION_VERSION, + }, + ) + - annot.used( - prev_annot_ver, - annot.get_startTime(), - [(PROV_ROLE, list(annotator.get_attribute(PROV_ROLE))[0])], +@dataclass +class ReleaseModel: + release: Release + tag: GitTag + ctx: ProvenanceContext = field(init=False) + + def __post_init__(self): + self.ctx = ProvenanceContext(ProvDocument()) + + @staticmethod + def query(repository: AbstractRepository) -> Iterable[tuple[Release, GitTag]]: + for release in 
repository.list_all(Release): + tag = repository.get(GitTag, sha=release.tag_sha) + yield release, tag + + def build_provenance_model(self) -> ProvDocument: + # Add the release + self.ctx.add_element(self.release) + self.ctx.add_element(self.release.author) + self.ctx.add_element(self.release.creation) + # Add all evidence files + for evidence in self.release.evidences: + self.ctx.add_element(evidence) + # Add all assets + for asset in self.release.assets: + self.ctx.add_element(asset) + # Add the tag + self.ctx.add_element(self.tag) + self.ctx.add_element(self.tag.creation) + self.ctx.add_element(self.tag.author) + # Add the release relationships + self.ctx.add_relation(self.release, self.release.author, ProvAttribution) + self.ctx.add_relation( + self.release, + self.release.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.release.creation.start, PROV_ROLE: ProvRole.RELEASE}, + ) + self.ctx.add_relation( + self.release.creation, + self.release.author, + ProvAssociation, + {PROV_ROLE: ProvRole.RELEASE_AUTHOR}, + ) + # Add the evidence and asset relationships + for evidence in self.release.evidences: + self.ctx.add_relation(evidence, self.release, ProvMembership) + self.ctx.add_relation(evidence, self.release.creation, ProvGeneration) + for asset in self.release.assets: + self.ctx.add_relation(asset, self.release, ProvMembership) + self.ctx.add_relation(asset, self.release.creation, ProvGeneration) + # Add tag relationships + self.ctx.add_relation(self.tag, self.release, ProvMembership) + self.ctx.add_relation(self.tag, self.tag.author, ProvAttribution) + self.ctx.add_relation( + self.tag, + self.tag.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.tag.creation.start, PROV_ROLE: ProvRole.TAG}, + ) + self.ctx.add_relation( + self.tag.creation, self.tag.author, ProvAssociation, {PROV_ROLE: ProvRole.TAG_AUTHOR} + ) + + +@dataclass +class GitTagModel: + """Model for a Git tag.""" + + tag: GitTag + commit: GitCommit + ctx: ProvenanceContext = 
field(init=False) + + def __post_init__(self): + self.ctx = ProvenanceContext(ProvDocument()) + + @staticmethod + def query(repository: AbstractRepository) -> Iterable[tuple[GitTag, GitCommit]]: + for tag in repository.list_all(GitTag): + commit = repository.get(GitCommit, sha=tag.commit_sha) + yield tag, commit + + def build_provenance_model(self) -> ProvDocument: + # Add the tag + self.ctx.add_element(self.tag) + self.ctx.add_element(self.tag.creation) + self.ctx.add_element(self.tag.author) + # Add the commit + self.ctx.add_element(self.commit) + self.ctx.add_element(self.commit.creation) + self.ctx.add_element(self.commit.author) + # Add tag relationships + self.ctx.add_relation( + self.tag, + self.tag.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.tag.creation.start, PROV_ROLE: ProvRole.TAG}, + ) + self.ctx.add_relation(self.tag, self.tag.author, ProvAttribution) + self.ctx.add_relation( + self.tag.creation, self.tag.author, ProvAssociation, {PROV_ROLE: ProvRole.TAG_AUTHOR} + ) + # Add commit relationships + self.ctx.add_relation(self.commit, self.tag, ProvMembership) + self.ctx.add_relation( + self.commit, + self.commit.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.commit.creation.start, PROV_ROLE: ProvRole.COMMIT}, ) - annot_ver.wasGeneratedBy( - annot, - time=annot.get_startTime(), - attributes=[(PROV_ROLE, ProvRole.RESOURCE_VERSION_AFTER_ANNOTATION)], + self.ctx.add_relation(self.commit, self.commit.author, ProvAttribution) + self.ctx.add_relation( + self.commit.creation, + self.commit.author, + ProvAssociation, + {PROV_ROLE: ProvRole.COMMIT_AUTHOR}, ) - prev_annot = annot - prev_annot_ver = annot_ver - return graph - - -def gitlab_release_tag_model(resources, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - for tag in resources.list_all(Tag): - release = resources.get(Release, tag_name=tag.name) - commit = resources.get(GitlabCommit, hexsha=tag.hexsha) - graph.update(release_and_tag(release, tag)) - 
graph.update(tag_and_commit(tag, commit)) - return graph - - -def release_and_tag(release: Optional[Release], tag: Tag, graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - t = graph.collection(*tag) - - if release is None: - return graph - - r = graph.collection(*release) - c = graph.activity(*release.creation) - t.hadMember(r) - r.wasGeneratedBy(c, time=c.get_startTime(), attributes=[(PROV_ROLE, ProvRole.RELEASE)]) - for asset in release.assets: - graph.entity(*asset).hadMember(graph.entity(*release)) - for evidence in release.evidences: - graph.entity(*evidence).hadMember(graph.entity(*release)) - - if release.author is None: - return graph - - at = graph.agent(*release.author) - r.wasAttributedTo(at) - c.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) - - return graph - - -def tag_and_commit(tag: Tag, commit: Optional[GitlabCommit], graph: ProvDocument = None): - if graph is None: - graph = graph_factory() - t = graph.collection(*tag) - tc = graph.activity(*tag.creation) - at = graph.agent(*tag.author) - t.wasAttributedTo(at) - t.wasGeneratedBy(tc, time=tc.get_startTime(), attributes=[(PROV_ROLE, ProvRole.TAG)]) - tc.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) - - if commit is None: - return graph - - cmt = graph.entity(*commit) - cc = graph.activity(*commit.creation) - at = graph.agent(*commit.author) - cmt.hadMember(t) - cmt.wasAttributedTo(at) - cmt.wasGeneratedBy(cc, time=cc.get_startTime(), attributes=[(PROV_ROLE, ProvRole.GIT_COMMIT)]) - cc.wasAssociatedWith( - at, plan=None, attributes=[(PROV_ROLE, list(at.get_attribute(PROV_ROLE))[0])] - ) - - return graph + return self.ctx.get_document() + + +@dataclass +class CallableModel: + """A model that can be called to build a provenance document.""" + + model: Type[ + FileAdditionModel + | FileModificationModel + | FileDeletionModel + | HostedResourceModel + | GitTagModel + | 
ReleaseModel + ] + query: Query + document: ProvDocument = field(init=False) + + def __post_init__(self): + # Initialize the document + self.document = ProvDocument() + + def __call__(self, repository: AbstractRepository): + # Pass the repository to the query + for args in self.query(repository): + # Initialize the model + m = self.model(*args) + # Update the document with the model + self.document.update(m.build_provenance_model()) + return self.document MODELS = [ - git_commit_model, - gitlab_commit_model, - gitlab_issue_model, - gitlab_merge_request_model, - gitlab_release_tag_model, + CallableModel(FileAdditionModel, FileAdditionQuery), + CallableModel(FileDeletionModel, FileDeletionQuery), + CallableModel(FileModificationModel, FileModificationQuery), + CallableModel(HostedResourceModel, HostedIssueQuery), + CallableModel(HostedResourceModel, HostedCommitQuery), + CallableModel(HostedResourceModel, HostedMergeQuery), + CallableModel(ReleaseModel, ReleaseModel.query), + CallableModel(GitTagModel, GitTagModel.query), ] From ae97b0396624b93dfa646a2af66d18d932675f54 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:52:02 +0100 Subject: [PATCH 23/81] operations.combine should use a sequence of arguments instead of an iterator --- gitlab2prov/prov/operations.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gitlab2prov/prov/operations.py b/gitlab2prov/prov/operations.py index e70d37d..09da949 100644 --- a/gitlab2prov/prov/operations.py +++ b/gitlab2prov/prov/operations.py @@ -92,12 +92,9 @@ def graph_factory(records: Optional[Sequence[ProvRecord]] = None) -> ProvDocumen return graph -def combine(graphs: Iterable[ProvDocument]) -> ProvDocument: - log.info(f"combine graphs {graphs}") - try: - acc = next(graphs) - except StopIteration: - return graph_factory() +def combine(*graphs: ProvDocument) -> ProvDocument: + log.info(f"combine graphs {graphs=}") + acc = graphs[0] for graph in graphs: acc.update(graph) return dedupe(acc) 
From bce781f0854bd34b8542185fd414f180ee399908 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:52:34 +0100 Subject: [PATCH 24/81] Inject the fetcher_factory instead of github/gitlab fetchers into handlers --- gitlab2prov/bootstrap.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gitlab2prov/bootstrap.py b/gitlab2prov/bootstrap.py index dbae7ea..33f7a30 100644 --- a/gitlab2prov/bootstrap.py +++ b/gitlab2prov/bootstrap.py @@ -3,7 +3,7 @@ from typing import Type from gitlab2prov.service_layer import handlers, messagebus, unit_of_work -from gitlab2prov.adapters.fetch import GitFetcher, GitlabFetcher, GithubFetcher +from gitlab2prov.adapters.fetch import GitFetcher, FetcherFactory log = logging.getLogger(__name__) @@ -12,13 +12,12 @@ def bootstrap( uow: unit_of_work.AbstractUnitOfWork = unit_of_work.InMemoryUnitOfWork(), git_fetcher: Type[GitFetcher] = GitFetcher, - gitlab_fetcher: Type[GitlabFetcher] = GitlabFetcher, + fetcher_factory: Type[FetcherFactory] = FetcherFactory, ): dependencies = { "uow": uow, "git_fetcher": git_fetcher, - "gitlab_fetcher": gitlab_fetcher, - "github_fetcher": GithubFetcher, + "fetcher_factory": fetcher_factory, } injected_handlers = { command_type: [inject_dependencies(handler, dependencies) for handler in handlers] From ccfcf68748f4fd724df052a4cdc9984e537cef08 Mon Sep 17 00:00:00 2001 From: cdboer Date: Mon, 9 Jan 2023 10:53:32 +0100 Subject: [PATCH 25/81] Format with black & adjust validation to the new parser validation method --- gitlab2prov/entrypoints/cli.py | 53 +++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 6f375ce..6a678ba 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -30,12 +30,13 @@ def invoke_from_config(ctx: click.Context, _, filepath: str): def validate_config(ctx: click.Context, _, filepath: str): """Callback that validates 
config file using gitlab2prov/config/schema.json.""" if filepath: - try: - ConfigParser().validate(filepath) - print(ConfigParser().parse(filepath)) - except Exception as err: - ctx.fail(f"validation failed: {err}") - click.echo(f"-- OK --") + ok, err = ConfigParser().validate(filepath) + if ok: + config = ConfigParser().parse(filepath) + click.echo("Validation successful, the following command would be executed:\n") + click.echo(f"gitlab2prov {' '.join(config)}") + else: + ctx.fail(f"Validation failed: {err}") ctx.exit() @@ -120,13 +121,15 @@ def process_commands(processors, **kwargs): @cli.command("extract") -@click.option("-u", "--url", "urls", multiple=True, type=str, required=True, help="Project url[s].") +@click.option( + "-u", "--url", "urls", multiple=True, type=str, required=True, help="Project url[s]." +) @click.option("-t", "--token", required=True, type=str, help="Gitlab API token.") @click.pass_obj @generator def do_extract(bus, urls: list[str], token: str): """Extract provenance information for one or more gitlab projects. - + This command extracts provenance information from one or multiple gitlab projects. The extracted provenance is returned as a combined provenance graph. """ @@ -151,7 +154,7 @@ def do_extract(bus, urls: list[str], token: str): @generator def load(input): """Load provenance information from a file. - + This command reads one provenance graph from a file or multiple graphs from multiple files. """ for filepath in input: @@ -186,7 +189,7 @@ def load(input): @processor def save(graphs, format, output): """Save provenance information to a file. - + This command writes each provenance graph that is piped to it to a file. """ for idx, graph in enumerate(graphs, start=1): @@ -207,7 +210,7 @@ def save(graphs, format, output): @processor def pseudonymize(graphs): """Pseudonymize a provenance graph. - + This command pseudonymizes each provenance graph that is piped to it. 
""" for graph in graphs: @@ -223,7 +226,7 @@ def pseudonymize(graphs): @processor def combine(graphs): """Combine multiple graphs into one. - + This command combines all graphs that are piped to it into one. """ graphs = list(graphs) @@ -238,14 +241,30 @@ def combine(graphs): @cli.command("stats") -@click.option("--coarse", "resolution", flag_value="coarse", default=True, help="Print the number of PROV elements aswell as the overall number of relations.") -@click.option("--fine", "resolution", flag_value="fine", help="Print the number of PROV elements aswell as the number of PROV relations for each relation type.") -@click.option("--explain", "show_description", is_flag=True, help="Print a textual summary of all operations applied to the graphs.") +@click.option( + "--coarse", + "resolution", + flag_value="coarse", + default=True, + help="Print the number of PROV elements aswell as the overall number of relations.", +) +@click.option( + "--fine", + "resolution", + flag_value="fine", + help="Print the number of PROV elements aswell as the number of PROV relations for each relation type.", +) +@click.option( + "--explain", + "show_description", + is_flag=True, + help="Print a textual summary of all operations applied to the graphs.", +) @click.option("--formatter", type=click.Choice(["csv", "table"]), default="table") @processor def stats(graphs, resolution, show_description, formatter): """Print statistics such as node counts and relation counts. - + This command prints statistics for each processed provenance graph. Statistics include the number of elements for each element type aswell as the number of relations for each relation type. Optionally, a short textual summary of all operations applied to the processed graphs can be printed to stdout. @@ -277,7 +296,7 @@ def stats(graphs, resolution, show_description, formatter): @processor def merge_duplicated_agents(graphs, mapping): """Merge duplicated agents based on a name to aliases mapping. 
- + This command solves the problem of duplicated agents that can occur when the same physical user uses different user names and emails for his git and gitlab account. Based on a mapping of names to aliases the duplicated agents can be merged. From 61df234221dacd5208952a5adf672934aa771cf8 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 29 Jan 2023 18:14:42 +0100 Subject: [PATCH 26/81] Add subpackage for git fetching --- gitlab2prov/adapters/git/__init__.py | 1 + gitlab2prov/adapters/git/fetcher.py | 144 +++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 gitlab2prov/adapters/git/__init__.py create mode 100644 gitlab2prov/adapters/git/fetcher.py diff --git a/gitlab2prov/adapters/git/__init__.py b/gitlab2prov/adapters/git/__init__.py new file mode 100644 index 0000000..69cdbf5 --- /dev/null +++ b/gitlab2prov/adapters/git/__init__.py @@ -0,0 +1 @@ +from gitlab2prov.adapters.git.fetcher import GitFetcher \ No newline at end of file diff --git a/gitlab2prov/adapters/git/fetcher.py b/gitlab2prov/adapters/git/fetcher.py new file mode 100644 index 0000000..eaffa6f --- /dev/null +++ b/gitlab2prov/adapters/git/fetcher.py @@ -0,0 +1,144 @@ +from collections.abc import Iterator +from dataclasses import dataclass +from itertools import zip_longest +from tempfile import TemporaryDirectory +from pathlib import Path + +from git import Commit +from git import Repo + +from gitlab2prov.adapters.project_url import ProjectUrl +from gitlab2prov.domain.constants import ChangeType +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import File +from gitlab2prov.domain.objects import FileRevision +from gitlab2prov.domain.objects import GitCommit +from gitlab2prov.domain.objects import User + + +EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" + + +@dataclass +class GitFetcher: + project_url: type[ProjectUrl] + repo: Repo | None = None + tmpdir: TemporaryDirectory | None = None + + def __enter__(self): + self.tmpdir 
= TemporaryDirectory(ignore_cleanup_errors=True) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.repo: + self.repo.close() + if self.tmpdir: + self.tmpdir.cleanup() + + def do_clone(self, url: str, token: str) -> None: + clone_url = self.project_url(url).clone_url(token) + self.repo = Repo.clone_from(clone_url, self.tmpdir.name) + + def fetch_all(self) -> Iterator[GitCommit | File | FileRevision]: + yield from extract_commits(self.repo) + yield from extract_files(self.repo) + yield from extract_revisions(self.repo) + + +def get_author(commit: Commit) -> User: + return User( + name=commit.author.name, + email=commit.author.email, + gitlab_username=None, + gitlab_id=None, + prov_role=ProvRole.AUTHOR, + ) + + +def get_committer(commit: Commit) -> User: + return User( + name=commit.committer.name, + email=commit.committer.email, + gitlab_username=None, + gitlab_id=None, + prov_role=ProvRole.COMMITTER, + ) + + +def parse_log(log: str): + """Parse 'git log' output into file paths, commit hexshas, file status (aka change type). 
+ Example: + >>> parse_log( + ''' + 34db8646fe1648bef9b7ce6613ae4a06acffba66 + A foo.py + 9b65f80b44acffc8036fef932f801134533b99bd + M foo.py + ''' + ) + [(foo.py, 34db8646fe1648bef9b7ce6613ae4a06acffba66, A), (foo.py, 9b65f80b44acffc8036fef932f801134533b99bd, M)] + """ + # split at line breaks, strip whitespace, remove empty lines + lines = [line.strip() for line in log.split("\n") if line] + # every second line contains the SHA1 of a commit + hexshas = lines[::2] + # every other line contains a type, aswell as a file path + types = [line.split()[0][0] for line in lines[1::2]] + paths = [line.split()[1][:] for line in lines[1::2]] + # zip all three together + return zip(paths, hexshas, types) + + +def extract_commits(repo: Repo) -> Iterator[GitCommit]: + for commit in repo.iter_commits("--all"): + yield GitCommit( + sha=commit.hexsha, + title=commit.summary, + message=commit.message, + author=get_author(commit), + committer=get_committer(commit), + parents=[parent.hexsha for parent in commit.parents], + start=commit.authored_datetime, + end=commit.committed_datetime, + ) + + +def extract_files(repo: Repo) -> Iterator[File]: + for commit in repo.iter_commits("--all"): + # choose the parent commit to diff against + # use *magic* empty tree sha for commits without parents + parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA + # diff against parent + diff = commit.diff(parent, R=True) + # only consider files that have been added to the repository + # disregard modifications and deletions + for diff_item in diff.iter_change_type(ChangeType.ADDED): + # path for new files is stored in diff b_path + yield File( + name=Path(diff_item.b_path).name, path=diff_item.b_path, commit=commit.hexsha + ) + + +def extract_revisions(repo: Repo) -> Iterator[FileRevision]: + for file in extract_files(repo): + revs = [] + + for path, hexsha, status in parse_log( + repo.git.log( + "--all", + "--follow", + "--name-status", + "--pretty=format:%H", + "--", + file.path, + ) + ): 
+ revs.append( + FileRevision( + name=Path(path).name, path=path, commit=hexsha, status=status, file=file + ) + ) + # revisions remeber their predecessor (previous revision) + for rev, prev in zip_longest(revs, revs[1:]): + rev.previous = prev + yield rev From 7650d0c33900c2ef496cc92c4a77301595b53245 Mon Sep 17 00:00:00 2001 From: cdboer Date: Sun, 29 Jan 2023 18:15:14 +0100 Subject: [PATCH 27/81] Add subpackage for gitlab fetching --- gitlab2prov/adapters/lab/__init__.py | 1 + gitlab2prov/adapters/lab/classifiers.py | 472 ++++++++++++++++++++++++ gitlab2prov/adapters/lab/fetcher.py | 202 ++++++++++ gitlab2prov/adapters/lab/parser.py | 153 ++++++++ 4 files changed, 828 insertions(+) create mode 100644 gitlab2prov/adapters/lab/__init__.py create mode 100644 gitlab2prov/adapters/lab/classifiers.py create mode 100644 gitlab2prov/adapters/lab/fetcher.py create mode 100644 gitlab2prov/adapters/lab/parser.py diff --git a/gitlab2prov/adapters/lab/__init__.py b/gitlab2prov/adapters/lab/__init__.py new file mode 100644 index 0000000..719780d --- /dev/null +++ b/gitlab2prov/adapters/lab/__init__.py @@ -0,0 +1 @@ +from gitlab2prov.adapters.lab.fetcher import GitlabFetcher \ No newline at end of file diff --git a/gitlab2prov/adapters/lab/classifiers.py b/gitlab2prov/adapters/lab/classifiers.py new file mode 100644 index 0000000..a41d14e --- /dev/null +++ b/gitlab2prov/adapters/lab/classifiers.py @@ -0,0 +1,472 @@ +import logging +import re +from dataclasses import dataclass +from dataclasses import field +from dataclasses import InitVar +from typing import Any + + +log = logging.getLogger(__name__) + + +@dataclass(kw_only=True) +class Classifier: + patterns: InitVar[list[str]] + compiled: list[re.Pattern] = field(init=False, default_factory=list) + match: re.Match = field(init=False, default=None) + + def __post_init__(self, regexps: list[str]): + self.compiled = [re.compile(regex, re.IGNORECASE) for regex in regexps] + + @staticmethod + def match_length(match: re.Match) -> 
int: + if match is None: + raise TypeError(f"Expected argument of type re.Match, got {type(match)}.") + return match.end() - match.start() + + def matches(self, string: str) -> bool: + matches = [match for pt in self.compiled if (match := re.search(pt, string))] + self.match = max(matches, key=self.match_length, default=None) + return self.match is not None + + def groupdict(self) -> dict[str, Any]: + if not self.match: + return dict() + return self.match.groupdict() + + def __len__(self) -> int: + if not self.match: + return 0 + return self.match_length(self.match) + + +@dataclass(kw_only=True) +class ImportStatement(Classifier): + def replace(self, string: str) -> str: + if not self.match: + return string + # replace leftmost occurence + replaced = self.match.re.sub("", string, count=1) + # remove trailing whitespace + return replaced.strip() + + +@dataclass(kw_only=True) +class AnnotationClassifier(Classifier): + name: str = field(compare=False) + + +CLASSIFIERS = [ + AnnotationClassifier( + name="change_target_branch", + patterns=[ + r"^changed target branch from `(?P.+)` to `(?P.+)`$" + ], + ), + AnnotationClassifier( + name="change_epic", + patterns=[ + r"^changed epic to &(?P\d+)$", + r"^changed epic to &(?P.+)$", + r"^changed epic to (?P.+)&(?P\d+)$", + r"^changed epic to (?P.+)&(?P.+)$", + ], + ), + AnnotationClassifier( + name="add_to_epic", + patterns=[ + r"^added to epic &(?P\d+)$", + r"^added to epic &(?P.+)$", + ], + ), + AnnotationClassifier( + name="remove_from_epic", + patterns=[ + r"^removed from epic &(?P\d+)$", + r"^removed from epic &(?P.+)$", + ], + ), + AnnotationClassifier( + name="add_to_external_epic", + patterns=[ + r"^added to epic (?P.+)&(?P\d+)$", + r"^added to epic (?P.+)&(?P.+)$", + ], + ), + AnnotationClassifier( + name="remove_from_external_epic", + patterns=[ + r"^removed from epic (?P.+)&(?P\d+)$", + r"^removed from epic (?P.+)&(?P.+)$", + ], + ), + AnnotationClassifier( + name="close_by_external_commit", + patterns=[r"^closed 
via commit (?P.+)@(?P[0-9a-z]+)$"], + ), + AnnotationClassifier( + name="close_by_external_merge_request", + patterns=[r"^close via merge request (?P.+?)!(?P\d+)$"], + ), + AnnotationClassifier( + name="close_by_merge_request", + patterns=[ + r"^closed via merge request !(?P.+)$", + r"^status changed to closed by merge request !(?P.+)$", + ], + ), + AnnotationClassifier( + name="close_by_commit", + patterns=[ + r"^closed via commit (?P[a-z0-9]+)$", + r"^status changed to closed by commit (?P[a-z0-9]+)$", + ], + ), + AnnotationClassifier( + name="restore_source_branch", + patterns=[ + r"^restored source branch `(?P.+)`$", + ], + ), + AnnotationClassifier(name="remove_label", patterns=[r"^removed ~(?P\d+) label$"]), + AnnotationClassifier(name="add_label", patterns=[r"^added ~(?P\d+) label$"]), + AnnotationClassifier( + name="create_branch", + patterns=[r"^created branch \[`(?P.+)`\]\((?P.+)\).*$"], + ), + AnnotationClassifier( + name="mark_task_as_incomplete", + patterns=[r"^marked the task [*]{2}(?P.+)[*]{2} as incomplete$"], + ), + AnnotationClassifier( + name="mark_task_as_done", + patterns=[ + r"^marked the task [*]{2}(?P.+)[*]{2} as completed$", + ], + ), + AnnotationClassifier( + name="add_commits", + patterns=[ + r"added (?P\d+)\scommit[s]?\n\n.+(?P[a-z0-9]{8}) - (?P.+?)<.*", + r"^added (?P<number_of_commits>\d+) new commit[s]?:\n\n(\* (?P<short_sha>[a-z0-9]{8}) - (?P<title>.+?)\n)+$", + r"^added (?P<number_of_commits>\d+) new commit[s]?:\n\n(\* (?P<short_sha>[a-z0-9]{11}) - (?P<title>.+?)\n)+$", + r"^added (?P<number_of_commits>\d+) commit[s]?(?:.*\n?)*$", + r"^added 0 new commits:\n\n$", # seems weird + ], + ), + AnnotationClassifier( + name="address_in_merge_request", + patterns=[r"^created merge request !(?P<merge_request_iid>\d+) to address this issue$"], + ), + AnnotationClassifier( + name="unmark_as_work_in_progress", + patterns=[ + r"^unmarked as a [*]{2}work in progress[*]{2}$", + r"^unmarked this merge request as a work in progress$", + ], + ), + 
AnnotationClassifier( + name="mark_as_work_in_progress", + patterns=[ + r"^marked as a [*]{2}work in progress[*]{2}$", + r"^marked this merge request as a [*]{2}work in progress[*]{2}$", + ], + ), + AnnotationClassifier( + name="status_changed_to_merged", + patterns=[ + r"^merged$", + r"^status changed to merged$", + ], + ), + AnnotationClassifier(name="change_description", patterns=[r"^changed the description$"]), + AnnotationClassifier( + name="change_title", + patterns=[ + r"^changed title from [*]{2}(?P<old_title>.+)[*]{2} to [*]{2}(?P<new_title>.+)[*]{2}$", + r"^changed title: [*]{2}(?P<old_title>.+)[*]{2} → [*]{2}(?P<new_title>.+)[*]{2}$", + r"^title changed from [*]{2}(?P<old_title>.+)[*]{2} to [*]{2}(?P<new_title>.+)[*]{2}$", + ], + ), + AnnotationClassifier( + name="move_from", + patterns=[r"^moved from (?P<project_slug>.*?)#(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="move_to", + patterns=[r"^moved to (?P<project_slug>.*?)#(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier(name="reopen", patterns=[r"^reopened$", r"^status changed to reopened$"]), + AnnotationClassifier( + name="close", + patterns=[ + r"^closed$", + r"^status changed to closed$", + ], + ), + AnnotationClassifier( + name="unrelate_from_external_issue", + patterns=[r"^removed the relation with (?P<project_slug>.+)#(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="relate_to_external_issue", + patterns=[r"^marked this issue as related to (?P<project_slug>.+)#(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="unrelate_from_issue", + patterns=[r"^removed the relation with #(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="relate_to_issue", + patterns=[r"^marked this issue as related to #(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="has_duplicate", + patterns=[r"^marked #(?P<issue_iid>\d+) as a duplicate of this issue$"], + ), + AnnotationClassifier( + name="mark_as_duplicate", + patterns=[r"^marked this issue as a duplicate of 
#(?P<issue_iid>\d+)$"], + ), + AnnotationClassifier( + name="make_visible", + patterns=[ + r"^made the issue visible to everyone$", + r"^made the issue visible$", + ], + ), + AnnotationClassifier(name="make_confidential", patterns=[r"^made the issue confidential$"]), + AnnotationClassifier(name="remove_weight", patterns=[r"^removed the weight$"]), + AnnotationClassifier( + name="change_weight", + patterns=[r"^changed weight to [*]{2}(?P<weight>\d+)[*]{2}$"], + ), + AnnotationClassifier(name="remove_due_date", patterns=[r"^removed due date$"]), + AnnotationClassifier( + name="change_due_date", + patterns=[ + r"^changed due date to (?P<month>(?:january|february|march|april|may|june|july|august|september|october|november|december)) (?P<day>\d\d), (?P<year>\d{4})$" + ], + ), + AnnotationClassifier(name="remove_time_estimate", patterns=[r"^removed time estimate$"]), + AnnotationClassifier( + name="change_time_estimate", + patterns=[ + r"^changed time estimate to" + + r"(?:\s(?P<months>[-]?\d+)mo)?" + + r"(?:\s(?P<weeks>[-]?\d+)w)?" + + r"(?:\s(?P<days>[-]?\d+)d)?" + + r"(?:\s(?P<hours>[-]?\d+)h)?" + + r"(?:\s(?P<minutes>[-]?\d+)m)?" + + r"(?:\s(?P<seconds>[-]?\d+)s)?$" + ], + ), + AnnotationClassifier(name="unlock_merge_request", patterns=[r"^unlocked this merge request$"]), + AnnotationClassifier(name="lock_merge_request", patterns=[r"^locked this merge request$"]), + AnnotationClassifier(name="unlock_issue", patterns=[r"^unlocked this issue$"]), + AnnotationClassifier(name="lock_issue", patterns=[r"^locked this issue$"]), + AnnotationClassifier(name="remove_spent_time", patterns=[r"^removed time spent$"]), + AnnotationClassifier( + name="subtract_spent_time", + patterns=[ + r"^subtracted" + + r"(?:\s(?P<months>\d+)mo)?" + + r"(?:\s(?P<weeks>\d+)w)?" + + r"(?:\s(?P<days>\d+)d)?" + + r"(?:\s(?P<hours>\d+)h)?" + + r"(?:\s(?P<minutes>\d+)m)?" 
+ + r"\sof time spent at (?P<date>\d{4}-\d{2}-\d{2})$" + ], + ), + AnnotationClassifier( + name="add_spent_time", + patterns=[ + r"^added" + + r"(?:\s(?P<months>\d+)mo)?" + + r"(?:\s(?P<weeks>\d+)w)?" + + r"(?:\s(?P<days>\d+)d)?" + + r"(?:\s(?P<hours>\d+)h)?" + + r"(?:\s(?P<minutes>\d+)m)?" + + r"\sof time spent at (?P<date>\d{4}-\d{2}-\d{2})$" + ], + ), + AnnotationClassifier( + name="remove_milestone", + patterns=[r"^removed milestone$", r"^milestone removed$"], + ), + AnnotationClassifier( + name="change_milestone", + patterns=[ + r"^changed milestone to %(?P<milestone_iid>\d+)$", + r"^changed milestone to %(?P<milestone_name>.+)$", + r"^changed milestone to (?P<project_slug>.+)%(?P<milestone_iid>\d+)$", + r"^changed milestone to (?P<project_slug>.+)%(?P<milestone_name>.+)$", + r"^milestone changed to %(?P<milestone_iid>\d+)$", + r"^milestone changed to \[(?P<release_name>.+)\]\((?P<release_link>.+)\)$", + r"^milestone changed to (?P<release_name>.+)$", + ], + ), + AnnotationClassifier( + name="unassign_user", + patterns=[ + r"^unassigned @(?P<user_name>.*)$", + r"^removed assignee$", + ], + ), + AnnotationClassifier(name="assign_user", patterns=[r"^assigned to @(?P<user_name>.*)$"]), + AnnotationClassifier( + name="mention_in_external_merge_request", + patterns=[r"^mentioned in merge request (?P<project_slug>.+)!(?P<merge_request_iid>\d+)$"], + ), + AnnotationClassifier( + name="mention_in_merge_request", + patterns=[ + r"^mentioned in merge request !(?P<merge_request_iid>\d+)$", + ], + ), + AnnotationClassifier( + name="mention_in_external_commit", + patterns=[ + r"^mentioned in commit (?P<project_slug>.+)@(?P<commit_sha>[0-9a-z]{40})$", + ], + ), + AnnotationClassifier( + name="mention_in_commit", + patterns=[ + r"^mentioned in commit (?P<commit_sha>[0-9a-z]{40})$", + ], + ), + AnnotationClassifier( + name="mention_in_external_issue", + patterns=[ + r"^mentioned in issue (?P<project_slug>.+)#(?P<issue_iid>\d+)$", + ], + ), + AnnotationClassifier( + 
name="mention_in_issue", + patterns=[ + r"^mentioned in issue #(?P<issue_iid>\d+)$", + ], + ), + AnnotationClassifier(name="resolve_all_threads", patterns=[r"^resolved all threads$"]), + AnnotationClassifier( + name="approve_merge_request", patterns=[r"^approved this merge request$"] + ), + AnnotationClassifier( + name="resolve_all_discussions", + patterns=[ + r"^resolved all discussions$", + ], + ), + AnnotationClassifier( + name="unapprove_merge_request", patterns=[r"^unapproved this merge request$"] + ), + AnnotationClassifier( + name="enable_automatic_merge_on_pipeline_completion", + patterns=[ + r"^enabled an automatic merge when the pipeline for (?P<pipeline_commit_sha>[0-9a-z]+) succeeds$", + ], + ), + AnnotationClassifier( + name="enable_automatic_merge_on_build_success", + patterns=[ + r"^enabled an automatic merge when the build for (?P<commit_sha>[0-9a-z]+) succeeds$", + ], + ), + AnnotationClassifier( + name="abort_automatic_merge", + patterns=[r"^aborted the automatic merge because (?P<abort_reason>[a-z\s]+)$"], + ), + AnnotationClassifier( + name="cancel_automatic_merge", + patterns=[ + r"^canceled the automatic merge$", + ], + ), + AnnotationClassifier( + name="create_issue_from_discussion", + patterns=[r"^created #(?P<issue_iid>\d+) to continue this discussion$"], + ), + AnnotationClassifier( + name="mark_merge_request_as_ready", + patterns=[r"^marked this merge request as \*\*ready\*\*$"], + ), + AnnotationClassifier( + name="mark_merge_request_note_as_draft", + patterns=[r"^marked this merge request as \*\*draft\*\*$"], + ), + # TODO: allow n reviewers + AnnotationClassifier( + name="request_review", + patterns=[ + r"^requested review from @(?P<user_name>.*)$", + r"^requested review from @(?P<user_name>.*) and @(?P<user_name2>.*)$", + ], + ), + # TODO: allow n reviewers + AnnotationClassifier( + name="cancel_review_request", + patterns=[r"^removed review request for @(?P<user_name>.*)$"], + ), + AnnotationClassifier( + name="mention_in_epic", 
patterns=[r"^mentioned in epic &(?P<noteable_iid>\d+)$"] + ), + AnnotationClassifier( + name="reassign_user", + patterns=[ + r"^reassigned to @(?P<user_name>.*)$", + ], + ), + AnnotationClassifier( + name="remove_merge_request_from_merge_train", + patterns=[ + r"^removed this merge request from the merge train because no stages / jobs for this pipeline.$" + ], + ), + AnnotationClassifier( + name="start_merge_train", + patterns=[ + r"^started a merge train$", + ], + ), + AnnotationClassifier( + name="enable_automatic_add_to_merge_train", + patterns=[ + r"^enabled automatic add to merge train when the pipeline for (?P<pipeline_commit_sha>[0-9a-z]+) succeeds$", + ], + ), +] + +IMPORT_STATEMENT = ImportStatement( + patterns=[ + r"\*by (?P<pre_import_author>.+) on \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} \(imported from gitlab project\)\*", + r"\*by (?P<pre_import_author>.+) on \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\sUTC \(imported from gitlab project\)\*", + ], +) + + +@dataclass +class SystemNoteClassifier: + @staticmethod + def normalize(note: str) -> str: + return note.strip().lower() + + def longest_matching_classifier(self, note: str) -> AnnotationClassifier: + matching = (classifier for classifier in CLASSIFIERS if classifier.matches(note)) + return max(matching, key=len, default=None) + + def classify(self, note: str) -> tuple[str, dict[str, str]]: + # 1. normalize the note + key_value_pairs = {} + normalized_note = self.normalize(note) + # 2. remove import statements, if any and extract the key-value pairs + if IMPORT_STATEMENT.matches(normalized_note): + normalized_note = IMPORT_STATEMENT.replace(normalized_note) + key_value_pairs.update(IMPORT_STATEMENT.groupdict()) + # 3. find the longest matching classifier + if classifier := self.longest_matching_classifier(normalized_note): + key_value_pairs.update(classifier.groupdict()) + # 4. return the classifier name and the matched groups + return classifier.name, key_value_pairs + # 5. 
if no classifier matches, return "unknown" and an empty dict + return "unknown", key_value_pairs diff --git a/gitlab2prov/adapters/lab/fetcher.py b/gitlab2prov/adapters/lab/fetcher.py new file mode 100644 index 0000000..c03c944 --- /dev/null +++ b/gitlab2prov/adapters/lab/fetcher.py @@ -0,0 +1,202 @@ +import logging +import itertools +from typing import Iterator +from dataclasses import dataclass, field, InitVar + +from gitlab import Gitlab +from gitlab.exceptions import GitlabListError +from gitlab.v4.objects import Project + +from gitlab2prov.adapters.lab.parser import GitlabAnnotationParser +from gitlab2prov.adapters.project_url import GitlabProjectUrl +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import ( + Asset, + Evidence, + Commit, + Issue, + MergeRequest, + Release, + User, + GitTag, +) + + +log = logging.getLogger(__name__) + + +@dataclass +class GitlabFetcher: + token: InitVar[str] + url: InitVar[str] + + client: Gitlab = field(init=False) + project: Project = field(init=False) + parser: GitlabAnnotationParser = GitlabAnnotationParser() + + def __post_init__(self, token, url) -> None: + url = GitlabProjectUrl(url) + self.client = Gitlab(url.instance, private_token=token) + self.project = self.client.projects.get(url.slug) + + def log_list_err(self, log: logging.Logger, err: GitlabListError, cls: str) -> None: + log.error(f"failed to fetch {cls} from {self.project.url}") + log.error(f"error: {err}") + + def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: + yield from itertools.chain( + self.fetch_commits(), + self.fetch_issues(), + self.fetch_mergerequests(), + self.fetch_releases(), + self.fetch_tags(), + ) + + def fetch_commits(self) -> Iterator[Commit]: + try: + for commit in self.project.commits.list(all=True, per_page=100): + yield Commit( + sha=commit.id, + url=commit.web_url, + platform="gitlab", + author=User( + commit.author_name, commit.author_email, 
prov_role=ProvRole.COMMIT_AUTHOR + ), + annotations=self.parser.parse( + [ + *commit.comments.list(all=True, system=False), + *commit.comments.list(all=True, system=True), + ] + ), + authored_at=commit.authored_date, + committed_at=commit.committed_date, + ) + except GitlabListError as err: + self.log_list_err(log, err, "commits") + + def fetch_issues(self, state="all") -> Iterator[Issue]: + try: + for issue in self.project.issues.list(all=True, state=state, per_page=100): + yield Issue( + id=issue.id, + iid=issue.iid, + platform="gitlab", + title=issue.title, + body=issue.description, + url=issue.web_url, + author=User( + issue.author.get("name"), + issue.author.get("email"), + gitlab_username=issue.author.get("username"), + gitlab_id=issue.author.get("id"), + prov_role=ProvRole.ISSUE_AUTHOR, + ), + annotations=self.parser.parse( + [ + *issue.notes.list(all=True, system=False), + *issue.notes.list(all=True, system=True), + *issue.awardemojis.list(all=True), + *issue.resourcelabelevents.list(all=True), + *( + award + for note in issue.notes.list(all=True) + for award in note.awardemojis.list(all=True) + ), + ] + ), + created_at=issue.created_at, + closed_at=issue.closed_at, + ) + except GitlabListError as err: + self.log_list_err(log, err, "issues") + + def fetch_mergerequests(self, state="all") -> Iterator[MergeRequest]: + try: + for merge in self.project.mergerequests.list(all=True, state=state, per_page=100): + yield MergeRequest( + id=merge.id, + iid=merge.iid, + title=merge.title, + body=merge.description, + url=merge.web_url, + platform="gitlab", + source_branch=merge.source_branch, + target_branch=merge.target_branch, + author=User( + merge.author.get("name"), + merge.author.get("email"), + gitlab_username=merge.author.get("username"), + gitlab_id=merge.author.get("id"), + prov_role=ProvRole.MERGE_REQUEST_AUTHOR, + ), + annotations=self.parser.parse( + ( + *merge.notes.list(all=True, system=False), + *merge.notes.list(all=True, system=True), + 
*merge.awardemojis.list(all=True), + *merge.resourcelabelevents.list(all=True), + *( + award + for note in merge.notes.list(all=True) + for award in note.awardemojis.list(all=True) + ), + ) + ), + created_at=merge.created_at, + closed_at=merge.closed_at, + merged_at=merge.merged_at, + first_deployed_to_production_at=getattr( + merge, "first_deployed_to_production_at", None + ), + ) + except GitlabListError as err: + self.log_list_err(log, err, "merge requests") + + def fetch_releases(self) -> Iterator[Release]: + try: + for release in self.project.releases.list(all=True, per_page=100): + yield Release( + name=release.name, + body=release.description, + tag_name=release.tag_name, + author=User( + name=release.author.get("name"), + email=release.author.get("email"), + gitlab_username=release.author.get("username"), + gitlab_id=release.author.get("id"), + prov_role=ProvRole.RELEASE_AUTHOR, + ), + assets=[ + Asset(url=asset.get("url"), format=asset.get("format")) + for asset in release.assets.get("sources", []) + ], + evidences=[ + Evidence( + sha=evidence.get("sha"), + url=evidence.get("filepath"), + collected_at=evidence.get("collected_at"), + ) + for evidence in release.evidences + ], + created_at=release.created_at, + released_at=release.released_at, + ) + except GitlabListError as err: + self.log_list_err(log, err, "releases") + + def fetch_tags(self) -> Iterator[GitTag]: + try: + for tag in self.project.tags.list(all=True, per_page=100): + yield GitTag( + name=tag.name, + sha=tag.target, + message=tag.message, + author=User( + name=tag.commit.get("author_name"), + email=tag.commit.get("author_email"), + prov_role=ProvRole.TAG_AUTHOR, + ), + created_at=tag.commit.get("created_at"), + ) + except GitlabListError as err: + self.log_list_err(log, err, "tags") diff --git a/gitlab2prov/adapters/lab/parser.py b/gitlab2prov/adapters/lab/parser.py new file mode 100644 index 0000000..7116914 --- /dev/null +++ b/gitlab2prov/adapters/lab/parser.py @@ -0,0 +1,153 @@ +import 
logging +import uuid +from dataclasses import dataclass +from typing import TypeVar, Callable + +from gitlab.v4.objects import ( + ProjectIssueNote, + ProjectMergeRequestNote, + ProjectCommitComment, + ProjectIssueResourceLabelEvent, + ProjectMergeRequestResourceLabelEvent, + ProjectIssueAwardEmoji, + ProjectIssueNoteAwardEmoji, + ProjectMergeRequestAwardEmoji, + ProjectMergeRequestNoteAwardEmoji, +) + +from gitlab2prov.adapters.lab.classifiers import SystemNoteClassifier +from gitlab2prov.domain.objects import Annotation, User +from gitlab2prov.domain.constants import ProvRole + + +A = TypeVar("A") + +log = logging.getLogger(__name__) + + +@dataclass +class GitlabAnnotationParser: + + classifier: SystemNoteClassifier = SystemNoteClassifier() + + @staticmethod + def sort_by_date(annotations: list[Annotation]) -> list[Annotation]: + return list(sorted(annotations, key=lambda a: a.start)) + + def choose_parser(self, raw_annotation: A) -> Callable[[A], Annotation]: + match raw_annotation: + case ProjectIssueNote(system=True) | ProjectMergeRequestNote(system=True): + return self.parse_system_note + case ProjectIssueNote() | ProjectMergeRequestNote(): + return self.parse_note + case ProjectCommitComment(): + return self.parse_comment + case ProjectIssueResourceLabelEvent() | ProjectMergeRequestResourceLabelEvent(): + return self.parse_label + case ProjectIssueAwardEmoji() | ProjectIssueNoteAwardEmoji() | ProjectMergeRequestAwardEmoji() | ProjectMergeRequestNoteAwardEmoji(): + return self.parse_award + case _: + log.warning(f"no parser found for {raw_annotation=}") + return + + def parse(self, annotations: list[A]) -> list[Annotation]: + parsed_annotations = [] + for annotation in annotations: + if parser := self.choose_parser(annotation): + parsed_annotations.append(parser(annotation)) + return self.sort_by_date(parsed_annotations) + + def parse_system_note(self, note: ProjectIssueNote | ProjectMergeRequestNote) -> Annotation: + annotator = User( + 
name=note.author.get("name"), + email=note.author.get("email"), + gitlab_username=note.author.get("username"), + gitlab_id=note.author.get("id"), + prov_role=ProvRole.ANNOTATOR, + ) + annotation_name, key_value_pairs = self.classifier.classify(note.body) + return Annotation( + uid=note.id, + name=annotation_name, + body=note.body, + start=note.created_at, + end=note.created_at, + captured_kwargs=key_value_pairs, + annotator=annotator, + ) + + def parse_comment(self, comment: ProjectCommitComment) -> Annotation: + annotator = User( + name=comment.author.get("name"), + email=comment.author.get("email"), + gitlab_username=comment.author.get("username"), + gitlab_id=comment.author.get("id"), + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", + name="add_comment", + body=comment.note, + start=comment.created_at, + end=comment.created_at, + annotator=annotator, + ) + + def parse_note(self, note: ProjectIssueNote | ProjectMergeRequestNote) -> Annotation: + annotator = User( + name=note.author.get("name"), + email=note.author.get("email"), + gitlab_username=note.author.get("username"), + gitlab_id=note.author.get("id"), + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=note.id, + name="add_note", + body=note.body, + annotator=annotator, + start=note.created_at, + end=note.created_at, + ) + + def parse_award( + self, + award: ProjectIssueAwardEmoji + | ProjectIssueNoteAwardEmoji + | ProjectMergeRequestAwardEmoji + | ProjectMergeRequestNoteAwardEmoji, + ) -> Annotation: + annotator = User( + name=award.user.get("name"), + email=award.user.get("email"), + gitlab_username=award.user.get("username"), + gitlab_id=award.user.get("id"), + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=award.id, + name="add_award", + body=award.name, + annotator=annotator, + start=award.created_at, + end=award.created_at, + ) + + def parse_label( + self, label: ProjectIssueResourceLabelEvent | 
ProjectMergeRequestResourceLabelEvent + ) -> Annotation: + annotator = User( + name=label.user.get("name"), + email=label.user.get("email"), + gitlab_username=label.user.get("username"), + gitlab_id=label.user.get("id"), + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=label.id, + name=f"{label.action}_label", + body=label.action, + annotator=annotator, + start=label.created_at, + end=label.created_at, + ) From ab259d69b9ad9a272a750e91f374553b88dd65e8 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:15:31 +0100 Subject: [PATCH 28/81] Add subpackage for github fetching --- gitlab2prov/adapters/hub/__init__.py | 1 + gitlab2prov/adapters/hub/fetcher.py | 159 ++++++++++++++++++++++ gitlab2prov/adapters/hub/parser.py | 191 +++++++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 gitlab2prov/adapters/hub/__init__.py create mode 100644 gitlab2prov/adapters/hub/fetcher.py create mode 100644 gitlab2prov/adapters/hub/parser.py diff --git a/gitlab2prov/adapters/hub/__init__.py b/gitlab2prov/adapters/hub/__init__.py new file mode 100644 index 0000000..f777384 --- /dev/null +++ b/gitlab2prov/adapters/hub/__init__.py @@ -0,0 +1 @@ +from gitlab2prov.adapters.hub.fetcher import GithubFetcher \ No newline at end of file diff --git a/gitlab2prov/adapters/hub/fetcher.py b/gitlab2prov/adapters/hub/fetcher.py new file mode 100644 index 0000000..8e5f955 --- /dev/null +++ b/gitlab2prov/adapters/hub/fetcher.py @@ -0,0 +1,159 @@ +import logging +import itertools +from typing import Iterator +from dataclasses import dataclass, field, InitVar + +from github import Github +from github.Repository import Repository + +from gitlab2prov.adapters.project_url import GithubProjectUrl +from gitlab2prov.adapters.hub.parser import GithubAnnotationParser +from gitlab2prov.domain.constants import ProvRole +from gitlab2prov.domain.objects import ( + Asset, + User, + Commit, + Issue, + MergeRequest, + GitTag, + Release, +) + + +log = 
logging.getLogger(__name__) + + +@dataclass +class GithubFetcher: + token: InitVar[str] + url: InitVar[str] + + parser: GithubAnnotationParser = GithubAnnotationParser() + client: Github = field(init=False) + repository: Repository = field(init=False) + + def __post_init__(self, token, url) -> None: + self.client = Github(login_or_token=token, per_page=100) + self.repository = self.client.get_repo(full_name_or_id=GithubProjectUrl(url).slug) + log.warning(f"Remaining requests: {self.client.rate_limiting[0]}") + + def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: + yield from itertools.chain( + self.fetch_commits(), + self.fetch_issues(), + self.fetch_mergerequests(), + self.fetch_releases(), + self.fetch_tags(), + ) + + def fetch_commits(self) -> Iterator[Commit]: + for commit in self.repository.get_commits(): + raw_annotations = [ + *commit.get_statuses(), + *commit.get_comments(), + *(comment.get_reactions() for comment in commit.get_comments()), + ] + yield Commit( + sha=commit.sha, + url=commit.url, + author=User( + commit.commit.author.name, + commit.commit.author.email, + prov_role=ProvRole.COMMIT_AUTHOR, + ), + platform="github", + annotations=self.parser.parse(raw_annotations), + authored_at=commit.commit.author.date, + committed_at=commit.commit.committer.date, + ) + + def fetch_issues(self) -> Iterator[Issue]: + for issue in self.repository.get_issues(state="all"): + raw_annotations = [ + *issue.get_comments(), + *issue.get_reactions(), + *(comment.get_reactions() for comment in issue.get_comments()), + *issue.get_events(), + *issue.get_timeline(), + ] + yield Issue( + id=issue.number, + iid=issue.id, + platform="github", + title=issue.title, + body=issue.body, + url=issue.url, + author=User(issue.user.name, issue.user.email, prov_role=ProvRole.ISSUE_AUTHOR), + annotations=self.parser.parse(raw_annotations), + created_at=issue.created_at, + closed_at=issue.closed_at, + ) + + def fetch_mergerequests(self) -> 
Iterator[MergeRequest]: + for pull in self.repository.get_pulls(state="all"): + raw_annotations = [] + raw_annotations.extend(pull.get_comments()) + raw_annotations.extend(comment.get_reactions() for comment in pull.get_comments()) + raw_annotations.extend(pull.get_review_comments()) + raw_annotations.extend( + comment.get_reactions() for comment in pull.get_review_comments() + ) + raw_annotations.extend(pull.get_reviews()) + raw_annotations.extend(pull.as_issue().get_reactions()) + raw_annotations.extend(pull.as_issue().get_events()) + raw_annotations.extend(pull.as_issue().get_timeline()) + + yield MergeRequest( + id=pull.number, + iid=pull.id, + title=pull.title, + body=pull.body, + url=pull.url, + platform="github", + source_branch=pull.base.ref, + target_branch=pull.head.ref, + author=User( + name=pull.user.name, + email=pull.user.email, + prov_role=ProvRole.MERGE_REQUEST_AUTHOR, + ), + annotations=self.parser.parse(raw_annotations), + created_at=pull.created_at, + closed_at=pull.closed_at, + merged_at=pull.merged_at, + ) + + def fetch_releases(self) -> Iterator[Release]: + for release in self.repository.get_releases(): + yield Release( + name=release.title, + body=release.body, + tag_name=release.tag_name, + platform="github", + author=User( + name=release.author.name, + email=release.author.email, + prov_role=ProvRole.RELEASE_AUTHOR, + ), + assets=[ + Asset(url=asset.url, format=asset.content_type) + for asset in release.get_assets() + ], + evidences=[], + created_at=release.created_at, + released_at=release.published_at, + ) + + def fetch_tags(self) -> Iterator[GitTag]: + for tag in self.repository.get_tags(): + yield GitTag( + name=tag.name, + sha=tag.commit.sha, + message=tag.commit.commit.message, + author=User( + name=tag.commit.author.name, + email=tag.commit.author.email, + prov_role=ProvRole.TAG_AUTHOR, + ), + created_at=tag.commit.commit.author.date, + ) diff --git a/gitlab2prov/adapters/hub/parser.py b/gitlab2prov/adapters/hub/parser.py new file 
mode 100644 index 0000000..493256a --- /dev/null +++ b/gitlab2prov/adapters/hub/parser.py @@ -0,0 +1,191 @@ +import logging +from dataclasses import dataclass +from typing import TypeVar, Callable + +from github.CommitComment import CommitComment +from github.CommitStatus import CommitStatus +from github.Reaction import Reaction +from github.IssueComment import IssueComment +from github.IssueEvent import IssueEvent +from github.TimelineEvent import TimelineEvent +from github.PullRequestComment import PullRequestComment +from github.PullRequestReview import PullRequestReview + +from gitlab2prov.domain.objects import Annotation, User +from gitlab2prov.domain.constants import ProvRole + +A = TypeVar("A") + +log = logging.getLogger(__name__) + + +@dataclass +class GithubAnnotationParser: + @staticmethod + def sort_by_date(annotations: list[Annotation]) -> list[Annotation]: + return list(sorted(annotations, key=lambda a: a.start)) + + def choose_parser(self, raw_annotation: A) -> Callable[[A], Annotation]: + match raw_annotation: + case CommitComment(): + return self.parse_commit_comment + case CommitStatus(): + return self.parse_commit_status + case Reaction(): + return self.parse_reaction + case IssueComment(): + return self.parse_issue_comment + case IssueEvent(): + return self.parse_issue_event + case TimelineEvent(): + return self.parse_timeline_event + case PullRequestReview(): + return self.parse_pull_request_review + case PullRequestComment(): + return self.parse_pull_request_comment + case _: + log.warning(f"no parser found for {raw_annotation=}") + + def parse(self, annotations: list[A]) -> list[Annotation]: + parsed_annotations = [] + for annotation in annotations: + if parser := self.choose_parser(annotation): + parsed_annotations.append(parser(annotation)) + return self.sort_by_date(parsed_annotations) + + def parse_commit_comment(self, comment: CommitComment) -> Annotation: + annotator = User( + name=comment.user.name, + email=comment.user.email, + 
github_username=comment.user.login, + github_id=comment.user.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=comment.id, + name="add_comment", + body=comment.body, + start=comment.created_at, + end=comment.created_at, + annotator=annotator, + ) + + def parse_commit_status(self, status: CommitStatus) -> Annotation: + annotator = User( + name=status.creator.name, + email=status.creator.email, + github_username=status.creator.login, + github_id=status.creator.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=status.id, + name="add_commit_status", + body=status.description, + start=status.created_at, + end=status.created_at, + annotator=annotator, + ) + + def parse_reaction(self, reaction: Reaction) -> Annotation: + annotator = User( + name=reaction.user.name, + email=reaction.user.email, + github_username=reaction.user.login, + github_id=reaction.user.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=reaction.id, + name="add_award", + body=reaction.content, + start=reaction.created_at, + end=reaction.created_at, + annotator=annotator, + ) + + + def parse_issue_comment(self, comment: IssueComment) -> Annotation: + annotator = User( + name=comment.user.name, + email=comment.user.email, + github_username=comment.user.login, + github_id=comment.user.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=comment.id, + name="add_comment", + body=comment.body, + start=comment.created_at, + end=comment.created_at, + annotator=annotator, + ) + + def parse_issue_event(self, event: IssueEvent) -> Annotation: + annotator = User( + name=event.actor.name, + email=event.actor.email, + github_username=event.actor.login, + github_id=event.actor.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=event.id, + name=event.event, + body=event.event, + start=event.created_at, + end=event.created_at, + annotator=annotator, + ) + + def parse_timeline_event(self, event: TimelineEvent) -> Annotation: + annotator = 
User( + name=event.actor.name, + email=event.actor.email, + github_username=event.actor.login, + github_id=event.actor.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=event.id, + name=event.event, + body=event.event, + start=event.created_at, + end=event.created_at, + annotator=annotator, + ) + + def parse_pull_request_review(self, review: PullRequestReview) -> Annotation: + annotator = User( + name=review.user.name, + email=review.user.email, + github_username=review.user.login, + github_id=review.user.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=review.id, + name="add_review", + body=review.body, + start=review.submitted_at, + end=review.submitted_at, + annotator=annotator, + ) + + def parse_pull_request_comment(self, comment: PullRequestComment) -> Annotation: + annotator = User( + name=comment.user.name, + email=comment.user.email, + github_username=comment.user.login, + github_id=comment.user.id, + prov_role=ProvRole.ANNOTATOR, + ) + return Annotation( + uid=comment.id, + name="add_comment", + body=comment.body, + start=comment.created_at, + end=comment.created_at, + annotator=annotator, + ) From c0d23ad1d07d13d0cbb67652fbfb2f941b3d987a Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:16:06 +0100 Subject: [PATCH 29/81] Add class that handles everything related to project/clone urls --- gitlab2prov/adapters/project_url.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 gitlab2prov/adapters/project_url.py diff --git a/gitlab2prov/adapters/project_url.py b/gitlab2prov/adapters/project_url.py new file mode 100644 index 0000000..e73112d --- /dev/null +++ b/gitlab2prov/adapters/project_url.py @@ -0,0 +1,45 @@ +from urllib.parse import urlsplit +from dataclasses import dataclass + + +@dataclass +class ProjectUrl: + url: str + + @property + def slug(self) -> str: + if path := urlsplit(self.url).path: + owner, project = (s for s in path.split("/") if s) + 
return f"{owner}/{project}" + return None + + @property + def instance(self) -> str: + return f"{self.scheme}://{self.netloc}" + + @property + def netloc(self): + return urlsplit(self.url).netloc + + @property + def scheme(self): + return "https" + + def clone_url(self, platform: str, token: str | None = None, method: str = "https"): + urls = { + "gitlab": f"{self.instance}:{token}@{self.netloc}/{self.slug}", + "github": f"{self.scheme}://{token}@{self.netloc}/{self.slug}.git", + } + return urls.get(platform) + + +@dataclass +class GitlabProjectUrl(ProjectUrl): + def clone_url(self, token: str | None = None, method: str = "https"): + return super().clone_url("gitlab", token, method) + + +@dataclass +class GithubProjectUrl(ProjectUrl): + def clone_url(self, token: str | None = None, method: str = "https"): + return super().clone_url("github", token, method) From 4651d7362267f5da507adbafbd17b3508bfc0a77 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:17:06 +0100 Subject: [PATCH 30/81] Remove fetch subpackage as it has been replaced by 'git', 'hub', 'lab' pckgs --- gitlab2prov/adapters/fetch/__init__.py | 14 - gitlab2prov/adapters/fetch/_git.py | 150 ------ gitlab2prov/adapters/fetch/_github.py | 161 ------- gitlab2prov/adapters/fetch/_gitlab.py | 199 -------- .../adapters/fetch/annotations/__init__.py | 4 - .../adapters/fetch/annotations/classifiers.py | 445 ------------------ .../adapters/fetch/annotations/parse.py | 185 -------- gitlab2prov/adapters/fetch/utils.py | 21 - 8 files changed, 1179 deletions(-) delete mode 100644 gitlab2prov/adapters/fetch/__init__.py delete mode 100644 gitlab2prov/adapters/fetch/_git.py delete mode 100644 gitlab2prov/adapters/fetch/_github.py delete mode 100644 gitlab2prov/adapters/fetch/_gitlab.py delete mode 100644 gitlab2prov/adapters/fetch/annotations/__init__.py delete mode 100644 gitlab2prov/adapters/fetch/annotations/classifiers.py delete mode 100644 
gitlab2prov/adapters/fetch/annotations/parse.py delete mode 100644 gitlab2prov/adapters/fetch/utils.py diff --git a/gitlab2prov/adapters/fetch/__init__.py b/gitlab2prov/adapters/fetch/__init__.py deleted file mode 100644 index 10e7664..0000000 --- a/gitlab2prov/adapters/fetch/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from gitlab2prov.adapters.fetch._git import GitFetcher -from gitlab2prov.adapters.fetch._gitlab import GitlabFetcher -from gitlab2prov.adapters.fetch._github import GithubFetcher - - -class FetcherFactory: - @staticmethod - def factory(url: str): - if "github" in url: - return GithubFetcher - if "gitlab" in url: - return GitlabFetcher - raise ValueError(f"can't derive fetcher for unknown url {url=}") - \ No newline at end of file diff --git a/gitlab2prov/adapters/fetch/_git.py b/gitlab2prov/adapters/fetch/_git.py deleted file mode 100644 index 1daa138..0000000 --- a/gitlab2prov/adapters/fetch/_git.py +++ /dev/null @@ -1,150 +0,0 @@ -from collections.abc import Iterator -from dataclasses import dataclass -from itertools import zip_longest -from tempfile import TemporaryDirectory -from pathlib import Path - -from git import Commit -from git import Repo - -from gitlab2prov.adapters.fetch.utils import clone_over_https_url -from gitlab2prov.domain.constants import ChangeType -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import File -from gitlab2prov.domain.objects import FileRevision -from gitlab2prov.domain.objects import GitCommit -from gitlab2prov.domain.objects import User - - -EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" - - -@dataclass -class GitFetcher: - url: str - token: str - - _repo: Repo | None = None - _tmpdir: TemporaryDirectory | None = None - - def __enter__(self): - self._tmpdir = TemporaryDirectory(ignore_cleanup_errors=True) - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self._repo: - self._repo.close() - if self._tmpdir: - self._tmpdir.cleanup() - - def 
do_clone(self) -> None: - clone_url = "" - if "gitlab" in self.url: - clone_url = clone_over_https_url(self.url, self.token, "gitlab") - if "github" in self.url: - clone_url = clone_over_https_url(self.url, self.token, "github") - - self._repo = Repo.clone_from( - url=clone_url, - to_path=self._tmpdir.name, - ) - - def fetch_git(self) -> Iterator[GitCommit | File | FileRevision]: - yield from extract_commits(self._repo) - yield from extract_files(self._repo) - yield from extract_revisions(self._repo) - - -def get_author(commit: Commit) -> User: - return User( - name=commit.author.name, - email=commit.author.email, - gitlab_username=None, - gitlab_id=None, - prov_role=ProvRole.AUTHOR, - ) - - -def get_committer(commit: Commit) -> User: - return User( - name=commit.committer.name, - email=commit.committer.email, - gitlab_username=None, - gitlab_id=None, - prov_role=ProvRole.COMMITTER, - ) - - -def parse_log(log: str): - """Parse 'git log' output into file paths, commit hexshas, file status (aka change type). 
- Example: - >>> parse_log( - ''' - 34db8646fe1648bef9b7ce6613ae4a06acffba66 - A foo.py - 9b65f80b44acffc8036fef932f801134533b99bd - M foo.py - ''' - ) - [(foo.py, 34db8646fe1648bef9b7ce6613ae4a06acffba66, A), (foo.py, 9b65f80b44acffc8036fef932f801134533b99bd, M)] - """ - # split at line breaks, strip whitespace, remove empty lines - lines = [line.strip() for line in log.split("\n") if line] - # every second line contains the SHA1 of a commit - hexshas = lines[::2] - # every other line contains a type, aswell as a file path - types = [line.split()[0][0] for line in lines[1::2]] - paths = [line.split()[1][:] for line in lines[1::2]] - # zip all three together - return zip(paths, hexshas, types) - - -def extract_commits(repo: Repo) -> Iterator[GitCommit]: - for commit in repo.iter_commits("--all"): - yield GitCommit( - sha=commit.hexsha, - title=commit.summary, - message=commit.message, - author=get_author(commit), - committer=get_committer(commit), - parents=[parent.hexsha for parent in commit.parents], - start=commit.authored_datetime, - end=commit.committed_datetime, - ) - - -def extract_files(repo: Repo) -> Iterator[File]: - for commit in repo.iter_commits("--all"): - # choose the parent commit to diff against - # use *magic* empty tree sha for commits without parents - parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA - # diff against parent - diff = commit.diff(parent, R=True) - # only consider files that have been added to the repository - # disregard modifications and deletions - for diff_item in diff.iter_change_type(ChangeType.ADDED): - # path for new files is stored in diff b_path - yield File(name=Path(diff_item.b_path).name, path=diff_item.b_path, commit=commit.hexsha) - - -def extract_revisions(repo: Repo) -> Iterator[FileRevision]: - for file in extract_files(repo): - revs = [] - - for path, hexsha, status in parse_log( - repo.git.log( - "--all", - "--follow", - "--name-status", - "--pretty=format:%H", - "--", - file.path, - ) - ): - 
revs.append( - FileRevision(name=Path(path).name, path=path, commit=hexsha, status=status, file=file) - ) - # revisions remeber their predecessor (previous revision) - for rev, prev in zip_longest(revs, revs[1:]): - rev.previous = prev - yield rev diff --git a/gitlab2prov/adapters/fetch/_github.py b/gitlab2prov/adapters/fetch/_github.py deleted file mode 100644 index 346b799..0000000 --- a/gitlab2prov/adapters/fetch/_github.py +++ /dev/null @@ -1,161 +0,0 @@ -import logging -import itertools - -from typing import Iterator -from dataclasses import dataclass, field, InitVar - -from github import Github -from github.Repository import Repository - -from gitlab2prov.adapters.fetch.annotations import parse_annotations -from gitlab2prov.adapters.fetch.utils import project_slug -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import ( - Asset, - Release, - User, - Commit, - Issue, - MergeRequest, - GitTag, - Release, -) - - -log = logging.getLogger(__name__) - - -@dataclass -class GithubFetcher: - private_token: InitVar[str] - project_url: InitVar[str] - - client: Github = field(init=False) - repository: Repository = field(init=False) - - def __post_init__(self, private_token, project_url) -> None: - self.client = Github(login_or_token=private_token, per_page=100) - self.repository = self.client.get_repo(full_name_or_id=project_slug(project_url)) - log.warning(f"Remaining requests: {self.client.rate_limiting[0]}") - - def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: - yield from itertools.chain( - self.fetch_commits(), - self.fetch_issues(), - self.fetch_mergerequests(), - self.fetch_releases(), - self.fetch_tags(), - ) - - def fetch_commits(self) -> Iterator[Commit]: - for commit in self.repository.get_commits(): - parseables = [ - *commit.get_statuses(), - *commit.get_comments(), - *(comment.get_reactions() for comment in commit.get_comments()), - ] - yield Commit( - sha=commit.sha, - url=commit.url, - 
author=User( - commit.commit.author.name, - commit.commit.author.email, - prov_role=ProvRole.COMMIT_AUTHOR, - ), - platform="github", - annotations=parse_annotations(parseables), - authored_at=commit.commit.author.date, - committed_at=commit.commit.committer.date, - ) - - def fetch_issues(self) -> Iterator[Issue]: - for issue in self.repository.get_issues(state="all"): - parseables = [ - *issue.get_comments(), - *(comment.get_reactions() for comment in issue.get_comments()), - *issue.get_labels(), - *issue.get_events(), - *issue.get_timeline(), - ] - yield Issue( - id=issue.number, - iid=issue.id, - platform="github", - title=issue.title, - body=issue.body, - url=issue.url, - author=User(issue.user.name, issue.user.email, prov_role=ProvRole.ISSUE_AUTHOR), - annotations=[], - created_at=issue.created_at, - closed_at=issue.closed_at, - ) - - def fetch_mergerequests(self) -> Iterator[MergeRequest]: - for pull in self.repository.get_pulls(state="all"): - raw_annotations = [] - raw_annotations.extend(pull.get_comments()) - raw_annotations.extend(comment.get_reactions() for comment in pull.get_comments()) - raw_annotations.extend(pull.get_labels()) - raw_annotations.extend(pull.get_review_comments()) - raw_annotations.extend( - comment.get_reactions() for comment in pull.get_review_comments() - ) - raw_annotations.extend(pull.get_reviews()) - raw_annotations.extend(pull.as_issue().get_reactions()) - raw_annotations.extend(pull.as_issue().get_events()) - raw_annotations.extend(pull.as_issue().get_timeline()) - - yield MergeRequest( - id=pull.number, - iid=pull.id, - title=pull.title, - body=pull.body, - url=pull.url, - platform="github", - source_branch=pull.base.ref, - target_branch=pull.head.ref, - author=User( - name=pull.user.name, - email=pull.user.email, - prov_role=ProvRole.MERGE_REQUEST_AUTHOR, - ), - annotations=[], - created_at=pull.created_at, - closed_at=pull.closed_at, - merged_at=pull.merged_at, - ) - - def fetch_releases(self) -> Iterator[Release]: - for 
release in self.repository.get_releases(): - yield Release( - name=release.title, - body=release.body, - tag_name=release.tag_name, - platform="github", - author=User( - name=release.author.name, - email=release.author.email, - prov_role=ProvRole.RELEASE_AUTHOR, - ), - assets=[ - Asset(url=asset.url, format=asset.content_type) - for asset in release.get_assets() - ], - evidences=[], - created_at=release.created_at, - released_at=release.published_at, - ) - - def fetch_tags(self) -> Iterator[GitTag]: - for tag in self.repository.get_tags(): - yield GitTag( - name=tag.name, - sha=tag.commit.sha, - message=tag.commit.commit.message, - author=User( - name=tag.commit.author.name, - email=tag.commit.author.email, - prov_role=ProvRole.TAG_AUTHOR, - ), - created_at=tag.commit.commit.author.date, - ) diff --git a/gitlab2prov/adapters/fetch/_gitlab.py b/gitlab2prov/adapters/fetch/_gitlab.py deleted file mode 100644 index 5bf94db..0000000 --- a/gitlab2prov/adapters/fetch/_gitlab.py +++ /dev/null @@ -1,199 +0,0 @@ -import logging -import itertools - -from dataclasses import dataclass, field, InitVar -from typing import Iterator - -from gitlab import Gitlab -from gitlab.exceptions import GitlabListError - -from gitlab2prov.adapters.fetch.annotations import parse_annotations -from gitlab2prov.adapters.fetch.utils import instance_url, project_slug -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import ( - Asset, - Evidence, - Commit, - Issue, - MergeRequest, - Release, - GitTag, - User, - GitTag, -) - - -log = logging.getLogger(__name__) - - -@dataclass -class GitlabFetcher: - private_token: InitVar[str] - url: InitVar[str] = "https://gitlab.com" - - client: Gitlab = field(init=False) - project: Gitlab = field(init=False) - - def __post_init__(self, private_token, url) -> None: - self.client = Gitlab(instance_url(url), private_token=private_token) - self.project = self.client.projects.get(project_slug(url)) - - def log_list_err(self, log: 
logging.Logger, err: GitlabListError, cls: str) -> None: - log.error(f"failed to fetch {cls} from {instance_url(self.project)}") - log.error(f"error: {err}") - - def fetch_all(self) -> Iterator[Commit | Issue | MergeRequest | Release | GitTag]: - yield from itertools.chain( - self.fetch_commits(), - self.fetch_issues(), - self.fetch_mergerequests(), - self.fetch_releases(), - self.fetch_tags(), - ) - - def fetch_commits(self) -> Iterator[Commit]: - try: - for commit in self.project.commits.list(all=True, per_page=100): - yield Commit( - sha=commit.id, - url=commit.web_url, - platform="gitlab", - author=User( - commit.author_name, commit.author_email, prov_role=ProvRole.COMMIT_AUTHOR - ), - annotations=parse_annotations( - [ - *commit.comments.list(all=True, system=False), - *commit.comments.list(all=True, system=True), - ] - ), - authored_at=commit.authored_date, - committed_at=commit.committed_date, - ) - except GitlabListError as err: - self.log_list_err(log, err, "commits") - - def fetch_issues(self, state="all") -> Iterator[Issue]: - try: - for issue in self.project.issues.list(all=True, state=state, per_page=100): - yield Issue( - id=issue.id, - iid=issue.iid, - platform="gitlab", - title=issue.title, - body=issue.description, - url=issue.web_url, - author=User( - issue.author.get("name"), - issue.author.get("email"), - gitlab_username=issue.author.get("username"), - gitlab_id=issue.author.get("id"), - prov_role=ProvRole.ISSUE_AUTHOR, - ), - annotations=parse_annotations( - [ - *issue.notes.list(all=True, system=False), - *issue.notes.list(all=True, system=True), - *issue.awardemojis.list(all=True), - *issue.resourcelabelevents.list(all=True), - *( - award - for note in issue.notes.list(all=True) - for award in note.awardemojis.list(all=True) - ), - ] - ), - created_at=issue.created_at, - closed_at=issue.closed_at, - ) - except GitlabListError as err: - self.log_list_err(log, err, "issues") - - def fetch_mergerequests(self, state="all") -> 
Iterator[MergeRequest]: - try: - for merge in self.project.mergerequests.list(all=True, state=state, per_page=100): - yield MergeRequest( - id=merge.id, - iid=merge.iid, - title=merge.title, - body=merge.description, - url=merge.web_url, - platform="gitlab", - source_branch=merge.source_branch, - target_branch=merge.target_branch, - author=User( - merge.author.get("name"), - merge.author.get("email"), - gitlab_username=merge.author.get("username"), - gitlab_id=merge.author.get("id"), - prov_role=ProvRole.MERGE_REQUEST_AUTHOR, - ), - annotations=parse_annotations( - ( - *merge.notes.list(all=True, system=False), - *merge.notes.list(all=True, system=True), - *merge.awardemojis.list(all=True), - *merge.resourcelabelevents.list(all=True), - *( - award - for note in merge.notes.list(all=True) - for award in note.awardemojis.list(all=True) - ), - ) - ), - created_at=merge.created_at, - closed_at=merge.closed_at, - merged_at=merge.merged_at, - first_deployed_to_production_at=getattr(merge, "first_deployed_to_production_at", None), - ) - except GitlabListError as err: - self.log_list_err(log, err, "merge requests") - - def fetch_releases(self) -> Iterator[Release]: - try: - for release in self.project.releases.list(all=True, per_page=100): - yield Release( - name=release.name, - body=release.description, - tag_name=release.tag_name, - author=User( - name=release.author.get("name"), - email=release.author.get("email"), - gitlab_username=release.author.get("username"), - gitlab_id=release.author.get("id"), - prov_role=ProvRole.RELEASE_AUTHOR, - ), - assets=[ - Asset(url=asset.get("url"), format=asset.get("format")) - for asset in release.assets.get("sources", []) - ], - evidences=[ - Evidence( - sha=evidence.get("sha"), - url=evidence.get("filepath"), - collected_at=evidence.get("collected_at"), - ) - for evidence in release.evidences - ], - created_at=release.created_at, - released_at=release.released_at, - ) - except GitlabListError as err: - self.log_list_err(log, err, 
"releases") - - def fetch_tags(self) -> Iterator[GitTag]: - try: - for tag in self.project.tags.list(all=True, per_page=100): - yield GitTag( - name=tag.name, - sha=tag.target, - message=tag.message, - author=User( - name=tag.commit.get("author_name"), - email=tag.commit.get("author_email"), - prov_role=ProvRole.TAG_AUTHOR, - ), - created_at=tag.commit.get("created_at"), - ) - except GitlabListError as err: - self.log_list_err(log, err, "tags") diff --git a/gitlab2prov/adapters/fetch/annotations/__init__.py b/gitlab2prov/adapters/fetch/annotations/__init__.py deleted file mode 100644 index db17990..0000000 --- a/gitlab2prov/adapters/fetch/annotations/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from gitlab2prov.adapters.fetch.annotations.classifiers import CLASSIFIERS -from gitlab2prov.adapters.fetch.annotations.classifiers import IMPORT_STATEMENT -from gitlab2prov.adapters.fetch.annotations.classifiers import AnnotationClassifier -from gitlab2prov.adapters.fetch.annotations.parse import parse_annotations diff --git a/gitlab2prov/adapters/fetch/annotations/classifiers.py b/gitlab2prov/adapters/fetch/annotations/classifiers.py deleted file mode 100644 index dd48c15..0000000 --- a/gitlab2prov/adapters/fetch/annotations/classifiers.py +++ /dev/null @@ -1,445 +0,0 @@ -import logging -import re -from dataclasses import dataclass -from dataclasses import field -from dataclasses import InitVar -from typing import Any - - -log = logging.getLogger(__name__) - - -def match_length(match: re.Match) -> int: - if match is None: - raise TypeError(f"Expected argument of type re.Match, got {type(match)}.") - return match.end() - match.start() - - -@dataclass(kw_only=True) -class Classifier: - patterns: InitVar[list[str]] - compiled: list[re.Pattern] = field(init=False, default_factory=list) - match: re.Match = field(init=False, default=None) - - def __post_init__(self, regexps: list[str]): - self.compiled = [re.compile(regex, re.IGNORECASE) for regex in regexps] - - def matches(self, 
string: str) -> bool: - matches = [match for pt in self.compiled if (match := re.search(pt, string))] - self.match = max(matches, key=match_length, default=None) - return self.match is not None - - def groupdict(self) -> dict[str, Any]: - if not self.match: - return dict() - return self.match.groupdict() - - def __len__(self) -> int: - if not self.match: - return 0 - return match_length(self.match) - - -@dataclass(kw_only=True) -class ImportStatement(Classifier): - def replace(self, string: str) -> str: - if not self.match: - return string - # replace leftmost occurence - replaced = self.match.re.sub("", string, count=1) - # remove trailing whitespace - return replaced.strip() - - -@dataclass(kw_only=True) -class AnnotationClassifier(Classifier): - name: str = field(compare=False) - - -CLASSIFIERS = [ - AnnotationClassifier( - name="change_target_branch", - patterns=[ - r"^changed target branch from `(?P<old_target_branch>.+)` to `(?P<new_target_branch>.+)`$" - ], - ), - AnnotationClassifier( - name="change_epic", - patterns=[ - r"^changed epic to &(?P<epic_iid>\d+)$", - r"^changed epic to &(?P<epic_name>.+)$", - r"^changed epic to (?P<project_slug>.+)&(?P<epic_name>\d+)$", - r"^changed epic to (?P<project_slug>.+)&(?P<epic_name>.+)$", - ], - ), - AnnotationClassifier( - name="add_to_epic", - patterns=[ - r"^added to epic &(?P<epic_iid>\d+)$", - r"^added to epic &(?P<epic_name>.+)$", - ], - ), - AnnotationClassifier( - name="remove_from_epic", - patterns=[ - r"^removed from epic &(?P<epic_iid>\d+)$", - r"^removed from epic &(?P<epic_name>.+)$", - ], - ), - AnnotationClassifier( - name="add_to_external_epic", - patterns=[ - r"^added to epic (?P<project_slug>.+)&(?P<epic_iid>\d+)$", - r"^added to epic (?P<project_slug>.+)&(?P<epic_name>.+)$", - ], - ), - AnnotationClassifier( - name="remove_from_external_epic", - patterns=[ - r"^removed from epic (?P<project_slug>.+)&(?P<epic_iid>\d+)$", - r"^removed from epic (?P<project_slug>.+)&(?P<epic_name>.+)$", - ], - ), - 
AnnotationClassifier( - name="close_by_external_commit", - patterns=[r"^closed via commit (?P<project_slug>.+)@(?P<commit_sha>[0-9a-z]+)$"], - ), - AnnotationClassifier( - name="close_by_external_merge_request", - patterns=[r"^close via merge request (?P<project_slug>.+?)!(?P<merge_request_iid>\d+)$"], - ), - AnnotationClassifier( - name="close_by_merge_request", - patterns=[ - r"^closed via merge request !(?P<merge_request_iid>.+)$", - r"^status changed to closed by merge request !(?P<merge_request_iid>.+)$", - ], - ), - AnnotationClassifier( - name="close_by_commit", - patterns=[ - r"^closed via commit (?P<commit_sha>[a-z0-9]+)$", - r"^status changed to closed by commit (?P<commit_sha>[a-z0-9]+)$", - ], - ), - AnnotationClassifier( - name="restore_source_branch", - patterns=[ - r"^restored source branch `(?P<branch_name>.+)`$", - ], - ), - AnnotationClassifier(name="remove_label", patterns=[r"^removed ~(?P<label_id>\d+) label$"]), - AnnotationClassifier(name="add_label", patterns=[r"^added ~(?P<label_id>\d+) label$"]), - AnnotationClassifier( - name="create_branch", - patterns=[r"^created branch \[`(?P<branch_name>.+)`\]\((?P<compare_link>.+)\).*$"], - ), - AnnotationClassifier( - name="mark_task_as_incomplete", - patterns=[r"^marked the task [*]{2}(?P<task_description>.+)[*]{2} as incomplete$"], - ), - AnnotationClassifier( - name="mark_task_as_done", - patterns=[ - r"^marked the task [*]{2}(?P<task_description>.+)[*]{2} as completed$", - ], - ), - AnnotationClassifier( - name="add_commits", - patterns=[ - r"added (?P<number_of_commits>\d+)\scommit[s]?\n\n.+(?P<short_sha>[a-z0-9]{8}) - (?P<title>.+?)<.*", - r"^added (?P<number_of_commits>\d+) new commit[s]?:\n\n(\* (?P<short_sha>[a-z0-9]{8}) - (?P<title>.+?)\n)+$", - r"^added (?P<number_of_commits>\d+) new commit[s]?:\n\n(\* (?P<short_sha>[a-z0-9]{11}) - (?P<title>.+?)\n)+$", - r"^added (?P<number_of_commits>\d+) commit[s]?(?:.*\n?)*$", - r"^added 0 new commits:\n\n$", # seems weird - ], - ), - 
AnnotationClassifier( - name="address_in_merge_request", - patterns=[r"^created merge request !(?P<merge_request_iid>\d+) to address this issue$"], - ), - AnnotationClassifier( - name="unmark_as_work_in_progress", - patterns=[ - r"^unmarked as a [*]{2}work in progress[*]{2}$", - r"^unmarked this merge request as a work in progress$", - ], - ), - AnnotationClassifier( - name="mark_as_work_in_progress", - patterns=[ - r"^marked as a [*]{2}work in progress[*]{2}$", - r"^marked this merge request as a [*]{2}work in progress[*]{2}$", - ], - ), - AnnotationClassifier( - name="status_changed_to_merged", - patterns=[ - r"^merged$", - r"^status changed to merged$", - ], - ), - AnnotationClassifier(name="change_description", patterns=[r"^changed the description$"]), - AnnotationClassifier( - name="change_title", - patterns=[ - r"^changed title from [*]{2}(?P<old_title>.+)[*]{2} to [*]{2}(?P<new_title>.+)[*]{2}$", - r"^changed title: [*]{2}(?P<old_title>.+)[*]{2} → [*]{2}(?P<new_title>.+)[*]{2}$", - r"^title changed from [*]{2}(?P<old_title>.+)[*]{2} to [*]{2}(?P<new_title>.+)[*]{2}$", - ], - ), - AnnotationClassifier( - name="move_from", - patterns=[r"^moved from (?P<project_slug>.*?)#(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier( - name="move_to", - patterns=[r"^moved to (?P<project_slug>.*?)#(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier(name="reopen", patterns=[r"^reopened$", r"^status changed to reopened$"]), - AnnotationClassifier( - name="close", - patterns=[ - r"^closed$", - r"^status changed to closed$", - ], - ), - AnnotationClassifier( - name="unrelate_from_external_issue", - patterns=[r"^removed the relation with (?P<project_slug>.+)#(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier( - name="relate_to_external_issue", - patterns=[r"^marked this issue as related to (?P<project_slug>.+)#(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier( - name="unrelate_from_issue", - patterns=[r"^removed the relation with #(?P<issue_iid>\d+)$"], - ), - 
AnnotationClassifier( - name="relate_to_issue", - patterns=[r"^marked this issue as related to #(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier( - name="has_duplicate", - patterns=[r"^marked #(?P<issue_iid>\d+) as a duplicate of this issue$"], - ), - AnnotationClassifier( - name="mark_as_duplicate", - patterns=[r"^marked this issue as a duplicate of #(?P<issue_iid>\d+)$"], - ), - AnnotationClassifier( - name="make_visible", - patterns=[ - r"^made the issue visible to everyone$", - r"^made the issue visible$", - ], - ), - AnnotationClassifier(name="make_confidential", patterns=[r"^made the issue confidential$"]), - AnnotationClassifier(name="remove_weight", patterns=[r"^removed the weight$"]), - AnnotationClassifier( - name="change_weight", - patterns=[r"^changed weight to [*]{2}(?P<weight>\d+)[*]{2}$"], - ), - AnnotationClassifier(name="remove_due_date", patterns=[r"^removed due date$"]), - AnnotationClassifier( - name="change_due_date", - patterns=[ - r"^changed due date to (?P<month>(?:january|february|march|april|may|june|july|august|september|october|november|december)) (?P<day>\d\d), (?P<year>\d{4})$" - ], - ), - AnnotationClassifier(name="remove_time_estimate", patterns=[r"^removed time estimate$"]), - AnnotationClassifier( - name="change_time_estimate", - patterns=[ - r"^changed time estimate to" - + r"(?:\s(?P<months>[-]?\d+)mo)?" - + r"(?:\s(?P<weeks>[-]?\d+)w)?" - + r"(?:\s(?P<days>[-]?\d+)d)?" - + r"(?:\s(?P<hours>[-]?\d+)h)?" - + r"(?:\s(?P<minutes>[-]?\d+)m)?" 
- + r"(?:\s(?P<seconds>[-]?\d+)s)?$" - ], - ), - AnnotationClassifier(name="unlock_merge_request", patterns=[r"^unlocked this merge request$"]), - AnnotationClassifier(name="lock_merge_request", patterns=[r"^locked this merge request$"]), - AnnotationClassifier(name="unlock_issue", patterns=[r"^unlocked this issue$"]), - AnnotationClassifier(name="lock_issue", patterns=[r"^locked this issue$"]), - AnnotationClassifier(name="remove_spent_time", patterns=[r"^removed time spent$"]), - AnnotationClassifier( - name="subtract_spent_time", - patterns=[ - r"^subtracted" - + r"(?:\s(?P<months>\d+)mo)?" - + r"(?:\s(?P<weeks>\d+)w)?" - + r"(?:\s(?P<days>\d+)d)?" - + r"(?:\s(?P<hours>\d+)h)?" - + r"(?:\s(?P<minutes>\d+)m)?" - + r"\sof time spent at (?P<date>\d{4}-\d{2}-\d{2})$" - ], - ), - AnnotationClassifier( - name="add_spent_time", - patterns=[ - r"^added" - + r"(?:\s(?P<months>\d+)mo)?" - + r"(?:\s(?P<weeks>\d+)w)?" - + r"(?:\s(?P<days>\d+)d)?" - + r"(?:\s(?P<hours>\d+)h)?" - + r"(?:\s(?P<minutes>\d+)m)?" 
- + r"\sof time spent at (?P<date>\d{4}-\d{2}-\d{2})$" - ], - ), - AnnotationClassifier( - name="remove_milestone", - patterns=[r"^removed milestone$", r"^milestone removed$"], - ), - AnnotationClassifier( - name="change_milestone", - patterns=[ - r"^changed milestone to %(?P<milestone_iid>\d+)$", - r"^changed milestone to %(?P<milestone_name>.+)$", - r"^changed milestone to (?P<project_slug>.+)%(?P<milestone_iid>\d+)$", - r"^changed milestone to (?P<project_slug>.+)%(?P<milestone_name>.+)$", - r"^milestone changed to %(?P<milestone_iid>\d+)$", - r"^milestone changed to \[(?P<release_name>.+)\]\((?P<release_link>.+)\)$", - r"^milestone changed to (?P<release_name>.+)$", - ], - ), - AnnotationClassifier( - name="unassign_user", - patterns=[ - r"^unassigned @(?P<user_name>.*)$", - r"^removed assignee$", - ], - ), - AnnotationClassifier(name="assign_user", patterns=[r"^assigned to @(?P<user_name>.*)$"]), - AnnotationClassifier( - name="mention_in_external_merge_request", - patterns=[r"^mentioned in merge request (?P<project_slug>.+)!(?P<merge_request_iid>\d+)$"], - ), - AnnotationClassifier( - name="mention_in_merge_request", - patterns=[ - r"^mentioned in merge request !(?P<merge_request_iid>\d+)$", - ], - ), - AnnotationClassifier( - name="mention_in_external_commit", - patterns=[ - r"^mentioned in commit (?P<project_slug>.+)@(?P<commit_sha>[0-9a-z]{40})$", - ], - ), - AnnotationClassifier( - name="mention_in_commit", - patterns=[ - r"^mentioned in commit (?P<commit_sha>[0-9a-z]{40})$", - ], - ), - AnnotationClassifier( - name="mention_in_external_issue", - patterns=[ - r"^mentioned in issue (?P<project_slug>.+)#(?P<issue_iid>\d+)$", - ], - ), - AnnotationClassifier( - name="mention_in_issue", - patterns=[ - r"^mentioned in issue #(?P<issue_iid>\d+)$", - ], - ), - AnnotationClassifier(name="resolve_all_threads", patterns=[r"^resolved all threads$"]), - AnnotationClassifier( - name="approve_merge_request", patterns=[r"^approved this merge request$"] - ), - 
AnnotationClassifier( - name="resolve_all_discussions", - patterns=[ - r"^resolved all discussions$", - ], - ), - AnnotationClassifier( - name="unapprove_merge_request", patterns=[r"^unapproved this merge request$"] - ), - AnnotationClassifier( - name="enable_automatic_merge_on_pipeline_completion", - patterns=[ - r"^enabled an automatic merge when the pipeline for (?P<pipeline_commit_sha>[0-9a-z]+) succeeds$", - ], - ), - AnnotationClassifier( - name="enable_automatic_merge_on_build_success", - patterns=[ - r"^enabled an automatic merge when the build for (?P<commit_sha>[0-9a-z]+) succeeds$", - ], - ), - AnnotationClassifier( - name="abort_automatic_merge", - patterns=[r"^aborted the automatic merge because (?P<abort_reason>[a-z\s]+)$"], - ), - AnnotationClassifier( - name="cancel_automatic_merge", - patterns=[ - r"^canceled the automatic merge$", - ], - ), - AnnotationClassifier( - name="create_issue_from_discussion", - patterns=[r"^created #(?P<issue_iid>\d+) to continue this discussion$"], - ), - AnnotationClassifier( - name="mark_merge_request_as_ready", - patterns=[r"^marked this merge request as \*\*ready\*\*$"], - ), - AnnotationClassifier( - name="mark_merge_request_note_as_draft", - patterns=[r"^marked this merge request as \*\*draft\*\*$"], - ), - # TODO: allow n reviewers - AnnotationClassifier( - name="request_review", - patterns=[ - r"^requested review from @(?P<user_name>.*)$", - r"^requested review from @(?P<user_name>.*) and @(?P<user_name2>.*)$", - ], - ), - # TODO: allow n reviewers - AnnotationClassifier( - name="cancel_review_request", - patterns=[r"^removed review request for @(?P<user_name>.*)$"], - ), - AnnotationClassifier( - name="mention_in_epic", patterns=[r"^mentioned in epic &(?P<noteable_iid>\d+)$"] - ), - AnnotationClassifier( - name="reassign_user", - patterns=[ - r"^reassigned to @(?P<user_name>.*)$", - ], - ), - AnnotationClassifier( - name="remove_merge_request_from_merge_train", - patterns=[ - r"^removed this merge request from 
the merge train because no stages / jobs for this pipeline.$" - ], - ), - AnnotationClassifier( - name="start_merge_train", - patterns=[ - r"^started a merge train$", - ], - ), - AnnotationClassifier( - name="enable_automatic_add_to_merge_train", - patterns=[ - r"^enabled automatic add to merge train when the pipeline for (?P<pipeline_commit_sha>[0-9a-z]+) succeeds$", - ], - ), -] - -IMPORT_STATEMENT = ImportStatement( - patterns=[ - r"\*by (?P<pre_import_author>.+) on \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} \(imported from gitlab project\)\*", - r"\*by (?P<pre_import_author>.+) on \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\sUTC \(imported from gitlab project\)\*", - ], -) diff --git a/gitlab2prov/adapters/fetch/annotations/parse.py b/gitlab2prov/adapters/fetch/annotations/parse.py deleted file mode 100644 index 383625d..0000000 --- a/gitlab2prov/adapters/fetch/annotations/parse.py +++ /dev/null @@ -1,185 +0,0 @@ -import logging -import operator -import uuid -from typing import Any -from typing import Callable -from typing import Sequence -from typing import TypeAlias - -from gitlab.v4.objects import ProjectCommitComment -from gitlab.v4.objects import ProjectIssueAwardEmoji -from gitlab.v4.objects import ProjectIssueNote -from gitlab.v4.objects import ProjectIssueNoteAwardEmoji -from gitlab.v4.objects import ProjectIssueResourceLabelEvent -from gitlab.v4.objects import ProjectMergeRequestAwardEmoji -from gitlab.v4.objects import ProjectMergeRequestNote -from gitlab.v4.objects import ProjectMergeRequestNoteAwardEmoji -from gitlab.v4.objects import ProjectMergeRequestResourceLabelEvent - -from gitlab2prov.adapters.fetch.annotations import AnnotationClassifier -from gitlab2prov.adapters.fetch.annotations import CLASSIFIERS -from gitlab2prov.adapters.fetch.annotations import IMPORT_STATEMENT -from gitlab2prov.domain.constants import ProvRole -from gitlab2prov.domain.objects import Annotation -from gitlab2prov.domain.objects import User - - -log = logging.getLogger(__name__) - - 
-DEFAULT = "default_annotation" - - -Comment: TypeAlias = ProjectCommitComment -Note: TypeAlias = ProjectIssueNote | ProjectMergeRequestNote -Label: TypeAlias = ProjectIssueResourceLabelEvent | ProjectMergeRequestResourceLabelEvent -AwardEmoji: TypeAlias = ( - ProjectIssueAwardEmoji - | ProjectIssueNoteAwardEmoji - | ProjectMergeRequestAwardEmoji - | ProjectMergeRequestNoteAwardEmoji -) - - -def normalize(string: str) -> str: - return string.strip().lower() - - -def longest_matching_classifier(string: str) -> AnnotationClassifier | None: - matching = (cls for cls in CLASSIFIERS if cls.matches(string)) - return max(matching, key=len, default=None) - - -def classify_system_note(string: str) -> tuple[str, dict[str, Any]]: - string = normalize(string) - kwargs = {} - # remove import statement, if present - if IMPORT_STATEMENT.matches(string): - string = IMPORT_STATEMENT.replace(string) - kwargs = IMPORT_STATEMENT.groupdict() - # find classifier by choosing the one with the longest match - if matching_classifier := longest_matching_classifier(string): - kwargs.update(matching_classifier.groupdict()) - return matching_classifier.name, kwargs - return DEFAULT, kwargs - - -def parse_system_note(note: Note) -> Annotation: - annotator = User( - name=note.author.get("name"), - email=note.author.get("email"), - gitlab_username=note.author.get("username"), - gitlab_id=note.author.get("id"), - prov_role=ProvRole.ANNOTATOR, - ) - annotation_type, kwargs = classify_system_note(note.body) - return Annotation( - uid=note.id, - name=annotation_type, - body=note.body, - start=note.created_at, - end=note.created_at, - captured_kwargs=kwargs, - annotator=annotator, - ) - - -def parse_comment(comment: Comment) -> Annotation: - annotator = User( - name=comment.author.get("name"), - email=comment.author.get("email"), - gitlab_username=comment.author.get("username"), - gitlab_id=comment.author.get("id"), - prov_role=ProvRole.ANNOTATOR, - ) - return Annotation( - 
uid=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", - name="add_comment", - body=comment.note, - start=comment.created_at, - end=comment.created_at, - annotator=annotator, - ) - - -def parse_note(note: Note) -> Annotation: - annotator = User( - name=note.author.get("name"), - email=note.author.get("email"), - gitlab_username=note.author.get("username"), - gitlab_id=note.author.get("id"), - prov_role=ProvRole.ANNOTATOR, - ) - return Annotation( - uid=note.id, - name="add_note", - body=note.body, - annotator=annotator, - start=note.created_at, - end=note.created_at, - ) - - -def parse_award(award: AwardEmoji) -> Annotation: - annotator = User( - name=award.user.get("name"), - email=award.user.get("email"), - gitlab_username=award.user.get("username"), - gitlab_id=award.user.get("id"), - prov_role=ProvRole.ANNOTATOR, - ) - return Annotation( - uid=award.id, - name="award_emoji", - body=award.name, - annotator=annotator, - start=award.created_at, - end=award.created_at, - ) - - -def parse_label(label: Label) -> Annotation: - annotator = User( - name=label.user.get("name"), - email=label.user.get("email"), - gitlab_username=label.user.get("username"), - gitlab_id=label.user.get("id"), - prov_role=ProvRole.ANNOTATOR, - ) - return Annotation( - uid=label.id, - name=f"{label.action}_label", - body=label.action, - annotator=annotator, - start=label.created_at, - end=label.created_at, - ) - - -def choose_parser( - parseable: Note | Comment | AwardEmoji | Label, -) -> Callable[[Note | Comment | AwardEmoji | Label], Annotation] | None: - match parseable: - case ProjectIssueNote(system=True) | ProjectMergeRequestNote(system=True): - return parse_system_note - case ProjectIssueNote() | ProjectMergeRequestNote(): - return parse_note - case ProjectCommitComment(): - return parse_comment - case ProjectIssueResourceLabelEvent() | ProjectMergeRequestResourceLabelEvent(): - return parse_label - case ProjectIssueAwardEmoji() | ProjectIssueNoteAwardEmoji() | 
ProjectMergeRequestAwardEmoji() | ProjectMergeRequestNoteAwardEmoji(): - return parse_award - case _: - log.warning(f"no parser found for {parseable=}") - return - - -def parse_annotations( - parseables: Sequence[Note | Comment | AwardEmoji | Label], -) -> Sequence[Annotation]: - annotations = [] - for parseable in parseables: - if parser := choose_parser(parseable): - annotations.append(parser(parseable)) - return sorted(annotations, key=operator.attrgetter("start")) diff --git a/gitlab2prov/adapters/fetch/utils.py b/gitlab2prov/adapters/fetch/utils.py deleted file mode 100644 index 32d26a0..0000000 --- a/gitlab2prov/adapters/fetch/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -from urllib.parse import urlsplit - - -def project_slug(url: str) -> str: - if path := urlsplit(url).path: - owner, project = (s for s in path.split("/") if s) - return f"{owner}/{project}" - return None - - -def instance_url(url: str) -> str: - split = urlsplit(url) - return f"{split.scheme}://{split.netloc}" - - -def clone_over_https_url(url: str, token: str, platform: str = "gitlab") -> str: - split = urlsplit(url) - if platform == "gitlab": - return f"https://gitlab.com:{token}@{split.netloc}/{project_slug(url)}" - if platform == "github": - return f"https://{token}@{split.netloc}/{project_slug(url)}.git" From a2d28daf451e5e8db607a9c9dc9cd289c03fc023 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:18:01 +0100 Subject: [PATCH 31/81] Remove 'Abstract' from 'AbstractRepository' --- gitlab2prov/adapters/repository.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gitlab2prov/adapters/repository.py b/gitlab2prov/adapters/repository.py index 9d9a642..93dbf25 100644 --- a/gitlab2prov/adapters/repository.py +++ b/gitlab2prov/adapters/repository.py @@ -6,7 +6,7 @@ R = TypeVar("R") -class AbstractRepository(abc.ABC): +class Repository(abc.ABC): def add(self, resource: R) -> None: self._add(resource) @@ -31,10 +31,8 @@ def _list_all(self, 
resource_type: Type[R], **filters: Any) -> list[R]: raise NotImplementedError -class InMemoryRepository(AbstractRepository): - # not super efficient - # should be fast enough for 1.0 - # snychronous get requests are the main culprit in slowing runtime +class InMemoryRepository(Repository): + # TODO: speed up retrieval def __init__(self): super().__init__() self.repo = defaultdict(list) From 49b04ad5e4c8776fb90a4325ad86f73d98275a8f Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:19:56 +0100 Subject: [PATCH 32/81] Replace config/parser.py by config/config.py --- gitlab2prov/config/config.py | 73 ++++++++++++++++++++++++++++++++++++ gitlab2prov/config/parser.py | 69 ---------------------------------- 2 files changed, 73 insertions(+), 69 deletions(-) create mode 100644 gitlab2prov/config/config.py delete mode 100644 gitlab2prov/config/parser.py diff --git a/gitlab2prov/config/config.py b/gitlab2prov/config/config.py new file mode 100644 index 0000000..4504806 --- /dev/null +++ b/gitlab2prov/config/config.py @@ -0,0 +1,73 @@ +import json +from typing import Any +from dataclasses import dataclass, field + +import jsonschema +import jsonschema.exceptions +from ruamel.yaml import YAML +import ruamel.yaml.constructor as constructor + +from gitlab2prov.root import get_package_root + + +@dataclass +class Config: + """A config file.""" + + content: str = "" + schema: dict[str, Any] = field(init=False) + + def __post_init__(self): + self.schema = self.get_schema() + + @classmethod + def read(cls, filepath: str): + """Read the config file from the given path.""" + with open(filepath, "rt") as f: + yaml = YAML(typ="safe") + return cls(content=yaml.load(f.read())) + + @staticmethod + def get_schema() -> dict[str, Any]: + """Get the schema from the config package.""" + path = get_package_root() / "config" / "schema.json" + with open(path, "rt", encoding="utf-8") as f: + return json.loads(f.read()) + + def validate(self) -> tuple[bool, str]: + 
"""Validate the config file against the schema.""" + try: + jsonschema.validate(self.content, self.schema) + except jsonschema.exceptions.ValidationError as err: + return False, err.message + except jsonschema.exceptions.SchemaError as err: + return False, err.message + except constructor.DuplicateKeyError as err: + return False, err.problem + return True, "Everything is fine!" + + def parse(self) -> list[str]: + """Parse the config file into a list of strings.""" + args = [] + + for obj in self.content: + command = list(obj.keys())[0] + args.append(command) + + options = obj.get(command) + if not options: + continue + + for name, literal in options.items(): + if isinstance(literal, bool): + args.append(f"--{name}") + elif isinstance(literal, str): + args.append(f"--{name}") + args.append(literal) + elif isinstance(literal, list): + for lit in literal: + args.append(f"--{name}") + args.append(lit) + else: + raise ValueError(f"Unknown literal type: {type(literal)}") + return args diff --git a/gitlab2prov/config/parser.py b/gitlab2prov/config/parser.py deleted file mode 100644 index 7899cc6..0000000 --- a/gitlab2prov/config/parser.py +++ /dev/null @@ -1,69 +0,0 @@ -import json -from typing import Any - -import jsonschema -import jsonschema.exceptions - -from ruamel.yaml import YAML, constructor - -from gitlab2prov.root import get_package_root - - -def read_file(filepath: str) -> Any: - with open(filepath, "rt") as f: - yaml = YAML(typ="safe") - return yaml.load(f.read()) - - -def get_schema() -> dict[str, Any]: - path = get_package_root() / "config" / "schema.json" - with open(path, "rt", encoding="utf-8") as f: - return json.loads(f.read()) - - -class ConfigParser: - @staticmethod - def validate(filepath: str) -> tuple[bool, str]: - try: - validator = jsonschema.Draft7Validator(get_schema()) - validator.validate(read_file(filepath)) - except jsonschema.exceptions.ValidationError as err: - return False, err.message - except jsonschema.exceptions.SchemaError as err: - 
return False, err.message - except constructor.DuplicateKeyError as err: - return False, err.problem - return True, "Everything is fine!" - - def parse(self, filepath: str) -> list[str]: - content = read_file(filepath) - return list(self.parse_array(content)) - - def parse_array(self, arr: list[Any]): - for obj in arr: - yield from self.parse_object(obj) - - def parse_object(self, obj: dict[str, Any]): - cmd = list(obj.keys())[0] - yield cmd - yield from self.parse_options(obj[cmd]) - - def parse_options(self, options: dict[str, bool | str | list[str]] | None): - if not options: - return - for name, value in options.items(): - yield from self.parse_option(name, value) - - def parse_option(self, name: str, literal: bool | str | list[str]): - match literal: - case bool(): - yield f"--{name}" - case str(): - yield f"--{name}" - yield literal - case list() as litlist: - for lit in litlist: - yield f"--{name}" - yield lit - case _: - raise ValueError(f"Unknown literal type!") From 1ae70a73eef331592ee7e1202c765d2e769a49fd Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:20:17 +0100 Subject: [PATCH 33/81] Update package exports --- gitlab2prov/config/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitlab2prov/config/__init__.py b/gitlab2prov/config/__init__.py index d2f9919..87f0140 100644 --- a/gitlab2prov/config/__init__.py +++ b/gitlab2prov/config/__init__.py @@ -1 +1 @@ -from gitlab2prov.config.parser import ConfigParser \ No newline at end of file +from gitlab2prov.config.config import Config \ No newline at end of file From 14cf6140f931993842ed42d717aa8151834c8a82 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:21:14 +0100 Subject: [PATCH 34/81] Fix schema by using 'oneOf' for array items (fix #78) --- gitlab2prov/config/schema.json | 91 ++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/gitlab2prov/config/schema.json 
b/gitlab2prov/config/schema.json index 2262428..6716945 100644 --- a/gitlab2prov/config/schema.json +++ b/gitlab2prov/config/schema.json @@ -1,8 +1,30 @@ { "$schema": "http://json-schema.org/draft-07/schema", "type": "array", - "items": [ - { + "items": { + "oneOf": [ + { + "$ref": "#/definitions/extract" + }, + { + "$ref": "#/definitions/load" + }, + { + "$ref": "#/definitions/combine" + }, + { + "$ref": "#/definitions/save" + }, + { + "$ref": "#/definitions/pseudonymize" + }, + { + "$ref": "#/definitions/stats" + } + ] + }, + "definitions": { + "extract": { "type": "object", "properties": { "extract": { @@ -25,12 +47,16 @@ "token" ] } - } + }, + "additionalProperties": false, + "required": [ + "extract" + ] }, - { + "load": { "type": "object", "properties": { - "open": { + "load": { "type": "object", "properties": { "input": { @@ -45,17 +71,37 @@ "input" ] } - } + }, + "additionalProperties": false, + "required": [ + "load" + ] }, - { + "combine": { "type": "object", "properties": { "combine": { "type": "null" } - } + }, + "additionalProperties": false, + "required": [ + "combine" + ] + }, + "pseudonymize": { + "type": "object", + "properties": { + "pseudonymize": { + "type": "null" + } + }, + "additionalProperties": false, + "required": [ + "pseudonymize" + ] }, - { + "save": { "type": "object", "properties": { "save": { @@ -67,7 +113,8 @@ "format": { "type": "array", "items": { - "type": "string" + "type": "string", + "enum": ["json", "rdf", "provn", "dot", "xml"] } } }, @@ -77,17 +124,13 @@ "format" ] } - } - }, - { - "type": "object", - "properties": { - "pseudonymize": { - "type": "null" - } - } + }, + "additionalProperties": false, + "required": [ + "save" + ] }, - { + "stats": { "type": "object", "properties": { "stats": { @@ -112,7 +155,11 @@ }, "additionalProperties": false } - } + }, + "additionalProperties": false, + "required": [ + "stats" + ] } - ] + } } \ No newline at end of file From b06435716482d9e8f94a3226b1a8f981db68e9cc Mon Sep 17 00:00:00 
2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:22:06 +0100 Subject: [PATCH 35/81] Add new commands --- gitlab2prov/domain/commands.py | 36 +++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/gitlab2prov/domain/commands.py b/gitlab2prov/domain/commands.py index 830eadd..56e0df4 100644 --- a/gitlab2prov/domain/commands.py +++ b/gitlab2prov/domain/commands.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from datetime import datetime from typing import Optional +from prov.model import ProvDocument @dataclass @@ -20,10 +21,39 @@ class Update(Fetch): @dataclass -class Reset(Command): - pass +class Normalize(Command): + document: ProvDocument + no_duplicates: bool = False + use_pseudonyms: bool = False + agent_mapping: str = "" + + +@dataclass +class Combine(Command): + documents: list[ProvDocument] + + +@dataclass +class Statistics(Command): + document: ProvDocument + resolution: str + format: str @dataclass class Serialize(Command): - pass + url: str = None + + +@dataclass +class Document2File(Command): + document: ProvDocument + filename: Optional[str] = None + format: Optional[str] = None + + +@dataclass +class File2Document(Command): + source: Optional[str] = None + content: Optional[str] = None + format: Optional[str] = None \ No newline at end of file From 2ef9f8e5e6e0775c6e7a1f5d7b5ad0714c8fb157 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:22:34 +0100 Subject: [PATCH 36/81] Add timestamps to AnnotatedVersions --- gitlab2prov/domain/objects.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gitlab2prov/domain/objects.py b/gitlab2prov/domain/objects.py index 9f22a79..f836a1d 100644 --- a/gitlab2prov/domain/objects.py +++ b/gitlab2prov/domain/objects.py @@ -164,6 +164,7 @@ class AnnotatedVersion: uid: str aid: str resource: str + start: datetime @property def identifier(self) -> QualifiedName: @@ -171,15 +172,15 @@ def identifier(self) -> 
QualifiedName: @classmethod def from_commit(cls, commit: Commit, annotation: Annotation): - return cls(uid=commit.sha, aid=annotation.uid, resource=ProvType.COMMIT) + return cls(uid=commit.sha, aid=annotation.uid, resource=ProvType.COMMIT, start=annotation.start) @classmethod def from_issue(cls, issue: Issue, annotation: Annotation): - return cls(uid=issue.id, aid=annotation.uid, resource=ProvType.ISSUE) + return cls(uid=issue.id, aid=annotation.uid, resource=ProvType.ISSUE, start=annotation.start) @classmethod def from_merge_request(cls, merge_request: MergeRequest, annotation: Annotation): - return cls(uid=merge_request.id, aid=annotation.uid, resource=ProvType.MERGE_REQUEST) + return cls(uid=merge_request.id, aid=annotation.uid, resource=ProvType.MERGE_REQUEST, start=annotation.start) def to_prov_element(self) -> ProvEntity: attributes = [("uid", self.uid), (PROV_TYPE, f"Annotated{self.resource}Version")] @@ -203,7 +204,7 @@ def identifier(self) -> QualifiedName: @classmethod def from_tag(cls, tag: GitTag): - return cls(uid=tag.name, resource=ProvType.TAG, start=tag.start, end=tag.end) + return cls(uid=tag.name, resource=ProvType.TAG, start=tag.created_at, end=tag.created_at) @classmethod def from_commit(cls, commit: Commit): From f9f18bb2d80d045ce4771d53f5b82e604324bab0 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:23:52 +0100 Subject: [PATCH 37/81] Not every tag has to have a matching commit --- gitlab2prov/prov/model.py | 59 +++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/gitlab2prov/prov/model.py b/gitlab2prov/prov/model.py index d2334ce..49fd4a8 100644 --- a/gitlab2prov/prov/model.py +++ b/gitlab2prov/prov/model.py @@ -23,7 +23,7 @@ from prov.identifier import QualifiedName, Namespace from functools import partial -from gitlab2prov.adapters.repository import AbstractRepository +from gitlab2prov.adapters.repository import Repository from gitlab2prov.domain.constants 
import ProvRole from gitlab2prov.domain.objects import ( FileRevision, @@ -47,18 +47,18 @@ HostedResource = Commit | Issue | MergeRequest -Query = Callable[[AbstractRepository], Iterable[HostedResource]] +Query = Callable[[Repository], Iterable[HostedResource]] DEFAULT_NAMESPACE = Namespace("ex", "example.org") -def file_status_query(repository: AbstractRepository, status: str): +def file_status_query(repository: Repository, status: str): for revision in repository.list_all(FileRevision, status=status): commit = repository.get(GitCommit, sha=revision.commit) for parent in [repository.get(GitCommit, sha=sha) for sha in commit.parents]: yield commit, parent, revision, revision.previous if status == "modified" else None -def hosted_resource_query(repository: AbstractRepository, resource_type: Type[HostedResource]): +def hosted_resource_query(repository: Repository, resource_type: Type[HostedResource]): for resource in repository.list_all(resource_type): if resource_type == Commit: yield (resource, repository.get(GitCommit, sha=resource.sha)) @@ -354,7 +354,10 @@ def _add_creation_part_for_hosted_commits(self): ) self.ctx.add_relation(self.resource.creation, self.commit, ProvCommunication) self.ctx.add_relation( - self.commit, self.commit.committer, ProvAssociation, {PROV_ROLE: ProvRole.COMMIT_AUTHOR} + self.commit, + self.commit.committer, + ProvAssociation, + {PROV_ROLE: ProvRole.COMMIT_AUTHOR}, ) def _add_creation_part(self): @@ -443,7 +446,7 @@ def __post_init__(self): self.ctx = ProvenanceContext(ProvDocument()) @staticmethod - def query(repository: AbstractRepository) -> Iterable[tuple[Release, GitTag]]: + def query(repository: Repository) -> Iterable[tuple[Release, GitTag]]: for release in repository.list_all(Release): tag = repository.get(GitTag, sha=release.tag_sha) yield release, tag @@ -503,16 +506,16 @@ class GitTagModel: """Model for a Git tag.""" tag: GitTag - commit: GitCommit + commit: Commit | None = None ctx: ProvenanceContext = field(init=False) def 
__post_init__(self): self.ctx = ProvenanceContext(ProvDocument()) @staticmethod - def query(repository: AbstractRepository) -> Iterable[tuple[GitTag, GitCommit]]: + def query(repository: Repository) -> Iterable[tuple[GitTag, Commit]]: for tag in repository.list_all(GitTag): - commit = repository.get(GitCommit, sha=tag.commit_sha) + commit = repository.get(Commit, sha=tag.sha) yield tag, commit def build_provenance_model(self) -> ProvDocument: @@ -521,9 +524,10 @@ def build_provenance_model(self) -> ProvDocument: self.ctx.add_element(self.tag.creation) self.ctx.add_element(self.tag.author) # Add the commit - self.ctx.add_element(self.commit) - self.ctx.add_element(self.commit.creation) - self.ctx.add_element(self.commit.author) + if self.commit: + self.ctx.add_element(self.commit) + self.ctx.add_element(self.commit.creation) + self.ctx.add_element(self.commit.author) # Add tag relationships self.ctx.add_relation( self.tag, @@ -536,20 +540,21 @@ def build_provenance_model(self) -> ProvDocument: self.tag.creation, self.tag.author, ProvAssociation, {PROV_ROLE: ProvRole.TAG_AUTHOR} ) # Add commit relationships - self.ctx.add_relation(self.commit, self.tag, ProvMembership) - self.ctx.add_relation( - self.commit, - self.commit.creation, - ProvGeneration, - {PROV_ATTR_STARTTIME: self.commit.creation.start, PROV_ROLE: ProvRole.COMMIT}, - ) - self.ctx.add_relation(self.commit, self.commit.author, ProvAttribution) - self.ctx.add_relation( - self.commit.creation, - self.commit.author, - ProvAssociation, - {PROV_ROLE: ProvRole.COMMIT_AUTHOR}, - ) + if self.commit: + self.ctx.add_relation(self.commit, self.tag, ProvMembership) + self.ctx.add_relation( + self.commit, + self.commit.creation, + ProvGeneration, + {PROV_ATTR_STARTTIME: self.commit.creation.start, PROV_ROLE: ProvRole.COMMIT}, + ) + self.ctx.add_relation(self.commit, self.commit.author, ProvAttribution) + self.ctx.add_relation( + self.commit.creation, + self.commit.author, + ProvAssociation, + {PROV_ROLE: 
ProvRole.COMMIT_AUTHOR}, + ) return self.ctx.get_document() @@ -572,7 +577,7 @@ def __post_init__(self): # Initialize the document self.document = ProvDocument() - def __call__(self, repository: AbstractRepository): + def __call__(self, repository: Repository): # Pass the repository to the query for args in self.query(repository): # Initialize the model From ff61032182b5a02e7476e5b635cdb4717e382252 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:24:22 +0100 Subject: [PATCH 38/81] Update provenance operations --- gitlab2prov/prov/operations.py | 45 ++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/gitlab2prov/prov/operations.py b/gitlab2prov/prov/operations.py index 09da949..486f5fe 100644 --- a/gitlab2prov/prov/operations.py +++ b/gitlab2prov/prov/operations.py @@ -1,7 +1,7 @@ import json import logging import hashlib -from typing import Iterable, NamedTuple, Type +from typing import NamedTuple, Type from collections import defaultdict, Counter from pathlib import Path @@ -34,23 +34,27 @@ DESERIALIZATION_FORMATS = ["rdf", "xml", "json"] -def serialize_graph( - graph: ProvDocument, format: str = "json", destination=None, encoding="utf-8" -) -> str | None: +def serialize(document: ProvDocument, destination=None, format: str = "json") -> str | None: + """Serialize a ProvDocument to a file or string.""" if format not in SERIALIZATION_FORMATS: raise ValueError("Unsupported serialization format.") - if format == "dot": - return prov_to_dot(graph).to_string().encode(encoding) - return graph.serialize(format=format, destination=destination) + if format != "dot": + return document.serialize(format=format, destination=destination) + string = prov_to_dot(document).to_string() + if not destination: + return string + with open(destination, "w") as f: + f.write(string) -def deserialize_graph(source: str = None, content: str = None): +def deserialize(source: str = None, content: str = None, format: 
str = None): + """Deserialize a ProvDocument from a file or string.""" for format in DESERIALIZATION_FORMATS: try: return ProvDocument.deserialize(source=source, content=content, format=format) - except: + except Exception: continue - raise Exception + raise Exception(f"Deseialization failed for {source=}, {content=}, {format=}") def format_stats_as_ascii_table(stats: dict[str, int]) -> str: @@ -61,13 +65,18 @@ def format_stats_as_ascii_table(stats: dict[str, int]) -> str: def format_stats_as_csv(stats: dict[str, int]) -> str: - csv = f"Record Type, Count\n" + csv = "Record Type, Count\n" for record_type, count in stats.items(): csv += f"{record_type}, {count}\n" return csv -def stats(graph: ProvDocument, resolution: str, formatter=format_stats_as_ascii_table) -> str: +def stats(graph: ProvDocument, resolution: str, format: str = "table") -> str: + if format == "csv": + formatter = format_stats_as_csv + if format == "table": + formatter = format_stats_as_ascii_table + elements = Counter(e.get_type().localpart for e in graph.get_records(ProvElement)) relations = Counter(r.get_type().localpart for r in graph.get_records(ProvRelation)) @@ -92,11 +101,11 @@ def graph_factory(records: Optional[Sequence[ProvRecord]] = None) -> ProvDocumen return graph -def combine(*graphs: ProvDocument) -> ProvDocument: - log.info(f"combine graphs {graphs=}") - acc = graphs[0] - for graph in graphs: - acc.update(graph) +def combine(*documents: ProvDocument) -> ProvDocument: + log.info(f"combine {documents=}") + acc = documents[0] + for document in documents[1:]: + acc.update(document) return dedupe(acc) @@ -139,7 +148,7 @@ def read(fp: Path) -> dict[str, list[str]]: data = f.read() d = json.loads(data) if not d: - log.info(f"empty agent mapping") + log.info("empty agent mapping") return dict() return d From d1202d7b2679eb8b1746051097968d9a00e01941 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:25:05 +0100 Subject: [PATCH 39/81] Remove 'Abstract' from 
'AbstractUnitOfWork' --- gitlab2prov/service_layer/unit_of_work.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gitlab2prov/service_layer/unit_of_work.py b/gitlab2prov/service_layer/unit_of_work.py index 46c208d..01f8ab0 100644 --- a/gitlab2prov/service_layer/unit_of_work.py +++ b/gitlab2prov/service_layer/unit_of_work.py @@ -1,12 +1,13 @@ from __future__ import annotations import abc +from collections import defaultdict from gitlab2prov.adapters import repository -class AbstractUnitOfWork(abc.ABC): - def __enter__(self) -> AbstractUnitOfWork: +class UnitOfWork(abc.ABC): + def __enter__(self) -> UnitOfWork: return self def __exit__(self, *args): @@ -31,9 +32,10 @@ def rollback(self): raise NotImplementedError -class InMemoryUnitOfWork(AbstractUnitOfWork): +class InMemoryUnitOfWork(UnitOfWork): def __init__(self): - self.resources = repository.InMemoryRepository() + # self.resources = repository.InMemoryRepository() + self.resources = defaultdict(repository.InMemoryRepository) def __enter__(self): return super().__enter__() From 348107adfa0fbc7872314dfae081d0c77693af5f Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:25:27 +0100 Subject: [PATCH 40/81] Update 'UnitOfWork' import --- gitlab2prov/service_layer/messagebus.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gitlab2prov/service_layer/messagebus.py b/gitlab2prov/service_layer/messagebus.py index 3afc567..077a722 100644 --- a/gitlab2prov/service_layer/messagebus.py +++ b/gitlab2prov/service_layer/messagebus.py @@ -5,7 +5,7 @@ from prov.model import ProvDocument from gitlab2prov.domain.commands import Command -from gitlab2prov.service_layer.unit_of_work import AbstractUnitOfWork +from gitlab2prov.service_layer.unit_of_work import UnitOfWork logger = logging.getLogger(__name__) @@ -13,11 +13,10 @@ @dataclass class MessageBus: - uow: AbstractUnitOfWork + uow: UnitOfWork handlers: dict[type[Command], list[Callable]] def 
handle(self, command: Command) -> ProvDocument | None: - # TODO: Return more than the last result... for handler in self.handlers[type(command)]: try: logger.debug(f"Handling command {command}.") From 8c3ad8bf46da2fdbe41f6314cf78102c3b8417be Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:25:47 +0100 Subject: [PATCH 41/81] Add new handlers, update existing ones --- gitlab2prov/service_layer/handlers.py | 53 +++++++++++++++++++-------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/gitlab2prov/service_layer/handlers.py b/gitlab2prov/service_layer/handlers.py index e15bf48..9eb83c3 100644 --- a/gitlab2prov/service_layer/handlers.py +++ b/gitlab2prov/service_layer/handlers.py @@ -9,46 +9,67 @@ def fetch_git(cmd: commands.Fetch, uow, git_fetcher) -> None: - with git_fetcher(cmd.url, cmd.token) as fetcher: - fetcher.do_clone() + with git_fetcher as fetcher: + fetcher.do_clone(cmd.url, cmd.token) with uow: - for resource in fetcher.fetch_git(): + for resource in fetcher.fetch_all(): log.info(f"add {resource=}") - uow.resources.add(resource) + uow.resources[cmd.url].add(resource) uow.commit() -def fetch_githosted(cmd: commands.Fetch, uow, fetcher_factory) -> None: - fetcher = fetcher_factory.factory(cmd.url) - log.info("choose fetcher {fetcher=} for {cmd.url=}") - fetcher = fetcher(cmd.token, cmd.url) +def fetch_githosted(cmd: commands.Fetch, uow, githosted_fetcher) -> None: + fetcher = githosted_fetcher(cmd.token, cmd.url) with uow: for resource in fetcher.fetch_all(): log.info(f"add {resource=}") - uow.resources.add(resource) + uow.resources[cmd.url].add(resource) uow.commit() -def reset(cmd: commands.Reset, uow): - log.info(f"reset repository {uow.resources=}") - uow.reset() - - def serialize(cmd: commands.Serialize, uow) -> ProvDocument: log.info(f"serialize graph consisting of {model.MODELS=}") document = ProvDocument() for prov_model in model.MODELS: - provenance = prov_model(uow.resources) + provenance = 
prov_model(uow.resources[cmd.url]) document = operations.combine(document, provenance) document = operations.dedupe(document) return document +def normalize(cmd: commands.Normalize): + if cmd.no_duplicates: + cmd.document = operations.dedupe(cmd.doc) + if cmd.use_pseudonyms: + cmd.document = operations.pseudonymize(cmd.doc) + return cmd.document + + +def combine(cmd: commands.Combine): + return operations.combine(*cmd.documents) + + +def document2file(cmd: commands.Document2File): + return operations.serialize(cmd.document, cmd.filename, cmd.format) + + +def file2document(cmd: commands.File2Document): + return operations.deserialize(cmd.source, cmd.content, cmd.format) + + +def statistics(cmd: commands.Statistics): + return operations.stats(cmd.document, cmd.resolution, cmd.format) + + HANDLERS = { commands.Fetch: [ fetch_git, fetch_githosted, ], - commands.Reset: [reset], commands.Serialize: [serialize], + commands.Document2File: [document2file], + commands.File2Document: [file2document], + commands.Combine: [combine], + commands.Normalize: [normalize], + commands.Statistics: [statistics], } From 19996b13b92c603e6331c48a25000131051f2264 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:26:42 +0100 Subject: [PATCH 42/81] Update bootstrapping to include the platform from which to fetch the project data (#79) --- gitlab2prov/bootstrap.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/gitlab2prov/bootstrap.py b/gitlab2prov/bootstrap.py index 33f7a30..c7bea98 100644 --- a/gitlab2prov/bootstrap.py +++ b/gitlab2prov/bootstrap.py @@ -1,23 +1,30 @@ import inspect import logging -from typing import Type from gitlab2prov.service_layer import handlers, messagebus, unit_of_work -from gitlab2prov.adapters.fetch import GitFetcher, FetcherFactory + +from gitlab2prov.adapters.git import GitFetcher +from gitlab2prov.adapters.lab import GitlabFetcher +from gitlab2prov.adapters.hub import GithubFetcher +from 
gitlab2prov.adapters.project_url import GithubProjectUrl, GitlabProjectUrl log = logging.getLogger(__name__) def bootstrap( - uow: unit_of_work.AbstractUnitOfWork = unit_of_work.InMemoryUnitOfWork(), - git_fetcher: Type[GitFetcher] = GitFetcher, - fetcher_factory: Type[FetcherFactory] = FetcherFactory, + platform: str, + uow: unit_of_work.UnitOfWork = unit_of_work.InMemoryUnitOfWork(), + git_fetcher: type[GitFetcher] = GitFetcher, + gitlab_fetcher: type[GitlabFetcher] = GitlabFetcher, + github_fetcher: type[GithubFetcher] = GithubFetcher, + github_url: type[GithubProjectUrl] = GithubProjectUrl, + gitlab_url: type[GitlabProjectUrl] = GitlabProjectUrl, ): dependencies = { "uow": uow, - "git_fetcher": git_fetcher, - "fetcher_factory": fetcher_factory, + "git_fetcher": git_fetcher(gitlab_url if platform == "gitlab" else github_url), + "githosted_fetcher": gitlab_fetcher if platform == "gitlab" else github_fetcher, } injected_handlers = { command_type: [inject_dependencies(handler, dependencies) for handler in handlers] From 39d06d4988b5dc05f34a22e30f42605b79cf73fd Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:27:19 +0100 Subject: [PATCH 43/81] Add seperate cli for github2prov (#79) --- gitlab2prov/entrypoints/cli.py | 301 ++++++++++++++++++++------------- 1 file changed, 186 insertions(+), 115 deletions(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 6a678ba..08d2eaa 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -1,12 +1,15 @@ +import sys from functools import partial from functools import update_wrapper from functools import wraps +from typing import Iterator +from prov.model import ProvDocument import click from gitlab2prov import __version__ from gitlab2prov import bootstrap -from gitlab2prov.config import ConfigParser +from gitlab2prov.config import Config from gitlab2prov.domain import commands from gitlab2prov.log import create_logger from 
gitlab2prov.prov import operations @@ -18,31 +21,37 @@ def enable_logging(ctx: click.Context, _, enable: bool): create_logger() -def invoke_from_config(ctx: click.Context, _, filepath: str): +def invoke_command_line_from_config(ctx: click.Context, _, filepath: str): """Callback that executes a gitlab2prov run from a config file.""" - if filepath: - args = ConfigParser().parse(filepath) - context = cli.make_context(f"{cli}", args=args, parent=ctx) - cli.invoke(context) - ctx.exit() + if not filepath: + return + config = Config.read(filepath) + ok, err = config.validate() + if not ok: + ctx.fail(f"Validation failed: {err}") + context = ctx.command.make_context(ctx.command.name, args=config.parse(), parent=ctx) + ctx.command.invoke(context) + ctx.exit() def validate_config(ctx: click.Context, _, filepath: str): """Callback that validates config file using gitlab2prov/config/schema.json.""" - if filepath: - ok, err = ConfigParser().validate(filepath) - if ok: - config = ConfigParser().parse(filepath) - click.echo("Validation successful, the following command would be executed:\n") - click.echo(f"gitlab2prov {' '.join(config)}") - else: - ctx.fail(f"Validation failed: {err}") - ctx.exit() + if not filepath: + return + config = Config.read(filepath) + ok, err = config.validate() + if not ok: + ctx.fail(f"Validation failed: {err}") + click.echo("Validation successful, the following command would be executed:\n") + click.echo(f"gitlab2prov {' '.join(config.parse())}") + ctx.exit() def processor(func, wrapped=None): - """Helper decorator to rewrite a function so that it returns another - function from it. + """Decorator that turns a function into a processor. + + A processor is a function that takes a stream of values, applies an operation to each value and returns a new stream of values. + A processor therefore transforms a stream of values into a new stream of values. 
""" @wraps(wrapped or func) @@ -56,8 +65,11 @@ def processor(stream): def generator(func): - """Similar to the :func:`processor` but passes through old values - unchanged and does not pass through the values as parameter.""" + """Decorator that turns a function into a generator. + + A generator is a special case of a processor. + A generator is a processor that doesn't apply any operation to the values but adds new values to the stream. + """ @partial(processor, wrapped=func) def new_func(stream, *args, **kwargs): @@ -72,7 +84,6 @@ def new_func(stream, *args, **kwargs): @click.option( "--verbose", is_flag=True, - is_eager=True, default=False, expose_value=False, callback=enable_logging, @@ -82,31 +93,61 @@ def new_func(stream, *args, **kwargs): "--config", type=click.Path(exists=True, dir_okay=False), expose_value=False, - callback=invoke_from_config, + callback=invoke_command_line_from_config, help="Read config from file.", ) @click.option( "--validate", - is_eager=True, type=click.Path(exists=True, dir_okay=False), expose_value=False, callback=validate_config, help="Validate config file and exit.", ) @click.pass_context -def cli(ctx): +def gitlab_cli(ctx): """ Extract provenance information from GitLab projects. 
""" - ctx.obj = bootstrap.bootstrap() + ctx.obj = bootstrap.bootstrap("gitlab") + + +@click.group(chain=True, invoke_without_command=False) +@click.version_option(version=__version__, prog_name="github2prov") +@click.option( + "--verbose", + is_flag=True, + default=False, + expose_value=False, + callback=enable_logging, + help="Enable logging to 'github2prov.log'.", +) +@click.option( + "--config", + type=click.Path(exists=True, dir_okay=False), + expose_value=False, + callback=invoke_command_line_from_config, + help="Read config from file.", +) +@click.option( + "--validate", + type=click.Path(exists=True, dir_okay=False), + expose_value=False, + callback=validate_config, + help="Validate config file and exit.", +) +@click.pass_context +def github_cli(ctx): + ctx.obj = bootstrap.bootstrap("github") -@cli.result_callback() +@github_cli.result_callback() +@gitlab_cli.result_callback() def process_commands(processors, **kwargs): - """This result callback is invoked with an iterable of all the chained - subcommands. As each subcommand returns a function - we can chain them together to feed one into the other, similar to how - a pipe on unix works. + """Execute the chain of commands. + + This function is called after all subcommands have been chained together. + It executes the chain of commands by piping the output of one command into the input of the next command. + Subcommands can be processors that transform the stream of values or generators that add new values to the stream. """ # Start with an empty iterable. stream = () @@ -120,61 +161,64 @@ def process_commands(processors, **kwargs): pass -@cli.command("extract") +@click.command("extract") @click.option( "-u", "--url", "urls", multiple=True, type=str, required=True, help="Project url[s]." 
) @click.option("-t", "--token", required=True, type=str, help="Gitlab API token.") @click.pass_obj @generator -def do_extract(bus, urls: list[str], token: str): +def extract(bus, urls: list[str], token: str): """Extract provenance information for one or more gitlab projects. This command extracts provenance information from one or multiple gitlab projects. The extracted provenance is returned as a combined provenance graph. """ - for url in urls: - bus.handle(commands.Fetch(url, token)) + document = None - graph = bus.handle(commands.Serialize()) - graph.description = f"graph extracted from '{', '.join(urls)}'" - yield graph + for url in urls: + doc = bus.handle(commands.Fetch(url, token)) + doc = bus.handle(commands.Serialize(url)) + doc = bus.handle(commands.Normalize(doc)) + if not document: + document = doc + document.update(doc) - bus.handle(commands.Reset()) + document.description = f"extracted from '{', '.join(urls)}'" + yield document -@cli.command("load", short_help="Load provenance files.") +@click.command("load", short_help="Load provenance files.") @click.option( "-i", "--input", + "sources", multiple=True, - type=click.Path(exists=True, dir_okay=False), + type=click.Path(dir_okay=False), help="Provenance file path (specify '-' to read from <stdin>).", ) +@click.pass_obj @generator -def load(input): +def load(bus, sources: list[str]): """Load provenance information from a file. This command reads one provenance graph from a file or multiple graphs from multiple files. 
""" - for filepath in input: + for filepath in sources: try: - if filepath == "-": - graph = operations.deserialize_graph() - graph.description = f"'<stdin>'" - yield graph - else: - graph = operations.deserialize_graph(filepath) - graph.description = f"'{filepath}'" - yield graph + filename = sys.stdin if filepath == "-" else filepath + document = bus.handle(commands.File2Document(filename)) + document.description = "'<stdin>'" if filepath == "-" else f"'{filepath}'" + yield document except Exception as e: click.echo(f"Could not open '{filepath}': {e}", err=True) -@cli.command("save") +@click.command("save") @click.option( "-f", "--format", + "formats", multiple=True, default=["json"], type=click.Choice(operations.SERIALIZATION_FORMATS), @@ -183,64 +227,79 @@ def load(input): @click.option( "-o", "--output", - default="gitlab2prov-graph-{:04}", + "destination", + default="-", + # TODO: think of a better default help="Output file path.", ) @processor -def save(graphs, format, output): - """Save provenance information to a file. +@click.pass_obj +def save(bus, documents, formats, destination): + """Save one or multiple provenance documents to a file. - This command writes each provenance graph that is piped to it to a file. + This command saves one or multiple provenance documents to a file. + + The output file path can be specified using the '-o' option. + The serialization format can be specified using the '-f' option. 
""" - for idx, graph in enumerate(graphs, start=1): - for fmt in format: + documents = list(documents) + + for i, document in enumerate(documents, start=1): + + for fmt in formats: + filename = f"{destination}{'-' + str(i) if len(documents) > 1 else ''}.{fmt}" try: - serialized = operations.serialize_graph(graph, fmt) - if output == "-": - click.echo(serialized) - else: - with open(f"{output.format(idx)}.{fmt}", "w") as out: - click.echo(serialized, file=out) - except Exception as e: - click.echo(f"Could not save {graph.description}: {e}", err=True) - yield graph - - -@cli.command("pseudonymize") + bus.handle(commands.Document2File(document, filename, fmt)) + except Exception as exc: + click.echo(f"Could not save {document.description}: {exc}", err=True) + + yield document + + +@click.command("pseudonymize") @processor -def pseudonymize(graphs): - """Pseudonymize a provenance graph. +@click.pass_obj +def pseudonymize(bus, documents: Iterator[ProvDocument]): + """Pseudonymize a provenance document. + + This command pseudonymizes one or multiple provenance documents. - This command pseudonymizes each provenance graph that is piped to it. + Pseudonymization is done by hashing attributes that contain personal information. + Pseudonymization only affects agents and their attributes. """ - for graph in graphs: + for document in documents: + try: - pseud = operations.pseudonymize(graph) - pseud.description = f"pseudonymized {graph.description}" - yield pseud - except Exception as e: - click.echo(f"Could not pseudonymize {graph.description}: {e}", err=True) + document = bus.handle(commands.Normalize(document, use_pseudonyms=True)) + document.description = f"pseudonymized {document.description}" + yield document + + except Exception as exc: + click.echo(f"Could not pseudonymize {document.description}: {exc}", err=True) -@cli.command("combine") +@click.command("combine") @processor -def combine(graphs): - """Combine multiple graphs into one. 
+@click.pass_obj +def combine(bus, documents: Iterator[ProvDocument]): + """Combine one or more provenance documents. - This command combines all graphs that are piped to it into one. + This command combines one or multiple provenance documents into a single document. """ - graphs = list(graphs) + documents = list(documents) + descriptions = [doc.description for doc in documents] + try: - combined = operations.combine(iter(graphs)) - descriptions = ", ".join(graph.description for graph in graphs) - combined.description = f"combination of {descriptions}" - yield combined - except Exception as e: - descriptions = "with ".join(graph.description for graph in graphs) - click.echo(f"Could not combine {descriptions}: {e}", err=True) + document = bus.handle(commands.Combine(documents)) + document = bus.handle(commands.Normalize(document)) + document.description = f"combination of {', '.join(descriptions)}" + yield document + + except Exception as exc: + click.echo(f"Could not combine {', '.join(descriptions)}: {exc}", err=True) -@cli.command("stats") +@click.command("stats") @click.option( "--coarse", "resolution", @@ -254,54 +313,66 @@ def combine(graphs): flag_value="fine", help="Print the number of PROV elements aswell as the number of PROV relations for each relation type.", ) +@click.option("--format", type=click.Choice(["csv", "table"]), default="table") @click.option( "--explain", - "show_description", is_flag=True, help="Print a textual summary of all operations applied to the graphs.", ) -@click.option("--formatter", type=click.Choice(["csv", "table"]), default="table") @processor -def stats(graphs, resolution, show_description, formatter): +@click.pass_obj +def stats(bus, documents: Iterator[ProvDocument], resolution: str, format: str, explain: bool): """Print statistics such as node counts and relation counts. This command prints statistics for each processed provenance graph. 
Statistics include the number of elements for each element type aswell as the number of relations for each relation type. Optionally, a short textual summary of all operations applied to the processed graphs can be printed to stdout. """ - for graph in graphs: + for document in documents: try: - if show_description: - click.echo(f"\nDescription: {graph.description.capitalize()}\n") - click.echo( - operations.stats( - graph, - resolution, - formatter=operations.format_stats_as_ascii_table - if formatter == "table" - else operations.format_stats_as_csv, - ) - ) - yield graph - except Exception as e: - click.echo(f"Could not display stats for {graph.description}: {e}", err=True) + statistics = bus.handle(commands.Statistics(document, resolution, format)) + if explain: + statistics = f"{document.description}\n\n{statistics}" + click.echo(statistics) + except: + click.echo("Could not compute statistics for {document.description}.", err=True) + yield document -@cli.command() +@click.command() @click.option( "--mapping", + "path_to_agent_map", type=click.Path(exists=True, dir_okay=False), help="File path to duplicate agent mapping.", ) @processor -def merge_duplicated_agents(graphs, mapping): +@click.pass_obj +def merge_duplicated_agents(bus, documents: Iterator[ProvDocument], path_to_agent_map: str): """Merge duplicated agents based on a name to aliases mapping. This command solves the problem of duplicated agents that can occur when the same physical user uses different user names and emails for his git and gitlab account. Based on a mapping of names to aliases the duplicated agents can be merged. 
""" - for graph in graphs: - graph = operations.merge_duplicated_agents(graph, mapping) - graph.description += f"merged double agents {graph.description}" - yield graph + for document in documents: + document = bus.handle(commands.Normalize(document, agent_mapping=path_to_agent_map)) + document.description += f"merged double agents {document.description}" + yield document + + +gitlab_cli.add_command(extract) +gitlab_cli.add_command(stats) +gitlab_cli.add_command(combine) +gitlab_cli.add_command(pseudonymize) +gitlab_cli.add_command(save) +gitlab_cli.add_command(load) +gitlab_cli.add_command(merge_duplicated_agents) + +github_cli.add_command(extract) +github_cli.add_command(stats) +github_cli.add_command(combine) +github_cli.add_command(pseudonymize) +github_cli.add_command(save) +github_cli.add_command(load) +github_cli.add_command(merge_duplicated_agents) From a90560e95cba5f73c9da898c092d229fc3c33d6e Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 29 Jan 2023 18:28:00 +0100 Subject: [PATCH 44/81] Add github2prov as seperate script (#79) --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d30a7a9..f755dfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,8 @@ dynamic = ["version"] dev = ["pytest", "pytest-mock", "black", "isort", "bump2version"] [project.scripts] -gitlab2prov = "gitlab2prov.entrypoints.cli:cli" +gitlab2prov = "gitlab2prov.entrypoints.cli:gitlab_cli" +github2prov = "gitlab2prov.entrypoints.cli:github_cli" [project.urls] Twitter = "https://twitter.com/dlr_software" From 243e9b8001ff7eacb838ff20da148307c50bdcb4 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 17:53:58 +0100 Subject: [PATCH 45/81] Remap status names to the ones used in the internal data structures --- gitlab2prov/adapters/git/fetcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gitlab2prov/adapters/git/fetcher.py 
b/gitlab2prov/adapters/git/fetcher.py index eaffa6f..052110d 100644 --- a/gitlab2prov/adapters/git/fetcher.py +++ b/gitlab2prov/adapters/git/fetcher.py @@ -133,6 +133,7 @@ def extract_revisions(repo: Repo) -> Iterator[FileRevision]: file.path, ) ): + status = {"A": "added", "M": "modified", "D": "deleted"}.get(status, "modified") revs.append( FileRevision( name=Path(path).name, path=path, commit=hexsha, status=status, file=file From b4074e73c40de34c030ad7c1f1d7f599f1e173b2 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:00:34 +0100 Subject: [PATCH 46/81] Spelling fix --- gitlab2prov/prov/model.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gitlab2prov/prov/model.py b/gitlab2prov/prov/model.py index 49fd4a8..9f5574a 100644 --- a/gitlab2prov/prov/model.py +++ b/gitlab2prov/prov/model.py @@ -14,7 +14,6 @@ ProvAssociation, ProvAttribution, ProvGeneration, - ProvRelation, ProvSpecialization, ProvCommunication, ProvRelation, @@ -55,7 +54,10 @@ def file_status_query(repository: Repository, status: str): for revision in repository.list_all(FileRevision, status=status): commit = repository.get(GitCommit, sha=revision.commit) for parent in [repository.get(GitCommit, sha=sha) for sha in commit.parents]: - yield commit, parent, revision, revision.previous if status == "modified" else None + if status == "modified": + yield commit, parent, revision, revision.previous + else: + yield commit, parent, revision def hosted_resource_query(repository: Repository, resource_type: Type[HostedResource]): @@ -128,7 +130,7 @@ def get_document(self): class FileAdditionModel: commit: GitCommit parent: GitCommit - revisions: FileRevision + revision: FileRevision ctx: ProvenanceContext = field(init=False) def __post_init__(self): @@ -210,10 +212,10 @@ def build_provenance_model(self) -> ProvDocument: self.ctx.add_relation(self.commit, self.parent, ProvCommunication) # Add the relations to the context self.ctx.add_relation( - 
self.commit, self.comitter, ProvAssociation, {PROV_ROLE: ProvRole.COMMITTER} + self.commit, self.commit.committer, ProvAssociation, {PROV_ROLE: ProvRole.COMMITTER} ) self.ctx.add_relation( - self.commit, self.author, ProvAssociation, {PROV_ROLE: ProvRole.AUTHOR} + self.commit, self.commit.author, ProvAssociation, {PROV_ROLE: ProvRole.AUTHOR} ) self.ctx.add_relation(self.revision, self.revision.file, ProvSpecialization) self.ctx.add_relation( From 76596e1c778d797fcf4ebb510d7475be5c2777e2 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:34:18 +0100 Subject: [PATCH 47/81] Update commands --- gitlab2prov/domain/commands.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/gitlab2prov/domain/commands.py b/gitlab2prov/domain/commands.py index 56e0df4..34594f6 100644 --- a/gitlab2prov/domain/commands.py +++ b/gitlab2prov/domain/commands.py @@ -6,35 +6,41 @@ @dataclass class Command: + """Base class for all commands.""" pass @dataclass class Fetch(Command): + """Fetch data from cloned repository and remote projects.""" url: str token: str @dataclass class Update(Fetch): + """Incremental update of data from cloned repository and remote projects.""" last_updated_at: datetime @dataclass -class Normalize(Command): +class Transform(Command): + """Apply transformations to the provenance document.""" document: ProvDocument - no_duplicates: bool = False use_pseudonyms: bool = False - agent_mapping: str = "" + remove_duplicates: bool = False + merge_aliased_agents: str = "" @dataclass class Combine(Command): + """Combine multiple provenance documents into one.""" documents: list[ProvDocument] @dataclass class Statistics(Command): + """Calculate statistics for the provenance document.""" document: ProvDocument resolution: str format: str @@ -42,18 +48,21 @@ class Statistics(Command): @dataclass class Serialize(Command): + """Retrieve/Serialize provenance document from interal data store.""" url: str = None 
@dataclass -class Document2File(Command): +class Write(Command): + """Write provenance document to file.""" document: ProvDocument filename: Optional[str] = None format: Optional[str] = None @dataclass -class File2Document(Command): - source: Optional[str] = None +class Read(Command): + """Read provenance document from file.""" + filename: Optional[str] = None content: Optional[str] = None format: Optional[str] = None \ No newline at end of file From 1046d44bba070a403dfec3b85785dcc55ff6042a Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:35:52 +0100 Subject: [PATCH 48/81] Update file reading/writing operations --- gitlab2prov/prov/operations.py | 50 +++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/gitlab2prov/prov/operations.py b/gitlab2prov/prov/operations.py index 486f5fe..0b89e00 100644 --- a/gitlab2prov/prov/operations.py +++ b/gitlab2prov/prov/operations.py @@ -1,4 +1,5 @@ import json +import sys import logging import hashlib from typing import NamedTuple, Type @@ -34,27 +35,44 @@ DESERIALIZATION_FORMATS = ["rdf", "xml", "json"] -def serialize(document: ProvDocument, destination=None, format: str = "json") -> str | None: - """Serialize a ProvDocument to a file or string.""" - if format not in SERIALIZATION_FORMATS: - raise ValueError("Unsupported serialization format.") - if format != "dot": - return document.serialize(format=format, destination=destination) - string = prov_to_dot(document).to_string() - if not destination: - return string - with open(destination, "w") as f: - f.write(string) +def read_provenance_file(filename: str) -> ProvDocument: + """Read provenance document from file or sys.stdin.""" + if filename == "-": + content = sys.stdin.read() + else: + with open(filename, "r") as f: + content = f.read() + return deserialize_string(content=content) -def deserialize(source: str = None, content: str = None, format: str = None): - """Deserialize a ProvDocument from a 
file or string.""" +def deserialize_string(content: str, format: str = None): + """Deserialize a ProvDocument from a string.""" for format in DESERIALIZATION_FORMATS: try: - return ProvDocument.deserialize(source=source, content=content, format=format) + doc = ProvDocument.deserialize(content=content, format=format) + return doc except Exception: - continue - raise Exception(f"Deseialization failed for {source=}, {content=}, {format=}") + pass + raise Exception(f"Deserialization failed for {content=}, {format=}") + + +def write_provenance_file( + document: ProvDocument, filename: str, format: str = "json", overwrite: bool = True +): + """Write provenance document to file.""" + if Path(filename).exists() and not overwrite: + raise FileExistsError(f"File {filename} already exists.") + with open(filename, "w") as f: + f.write(serialize_string(document, format=format)) + + +def serialize_string(document: ProvDocument, format: str = "json") -> str: + """Serialize a ProvDocument to a string.""" + if format not in SERIALIZATION_FORMATS: + raise ValueError("Unsupported serialization format.") + if format != "dot": + return document.serialize(format=format) + return prov_to_dot(document).to_string() def format_stats_as_ascii_table(stats: dict[str, int]) -> str: From 59f3d87921dc995b360d49b6868a82df022a61be Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:36:18 +0100 Subject: [PATCH 49/81] Update handlers to use correct commands --- gitlab2prov/service_layer/handlers.py | 28 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/gitlab2prov/service_layer/handlers.py b/gitlab2prov/service_layer/handlers.py index 9eb83c3..b17fb0b 100644 --- a/gitlab2prov/service_layer/handlers.py +++ b/gitlab2prov/service_layer/handlers.py @@ -9,6 +9,7 @@ def fetch_git(cmd: commands.Fetch, uow, git_fetcher) -> None: + log.info(f"fetch {cmd=}") with git_fetcher as fetcher: fetcher.do_clone(cmd.url, cmd.token) with uow: @@ 
-19,6 +20,7 @@ def fetch_git(cmd: commands.Fetch, uow, git_fetcher) -> None: def fetch_githosted(cmd: commands.Fetch, uow, githosted_fetcher) -> None: + log.info(f"fetch {cmd=}") fetcher = githosted_fetcher(cmd.token, cmd.url) with uow: for resource in fetcher.fetch_all(): @@ -31,33 +33,41 @@ def serialize(cmd: commands.Serialize, uow) -> ProvDocument: log.info(f"serialize graph consisting of {model.MODELS=}") document = ProvDocument() for prov_model in model.MODELS: + log.info(f"populate {prov_model=}") provenance = prov_model(uow.resources[cmd.url]) document = operations.combine(document, provenance) document = operations.dedupe(document) return document -def normalize(cmd: commands.Normalize): - if cmd.no_duplicates: +def transform(cmd: commands.Transform): + log.info(f"transform {cmd=}") + if cmd.remove_duplicates: cmd.document = operations.dedupe(cmd.doc) if cmd.use_pseudonyms: cmd.document = operations.pseudonymize(cmd.doc) + if cmd.merge_aliased_agents: + cmd.document = operations.merge_duplicated_agents(cmd.doc, cmd.merge_aliased_agents) return cmd.document def combine(cmd: commands.Combine): + log.info(f"combine {cmd=}") return operations.combine(*cmd.documents) -def document2file(cmd: commands.Document2File): - return operations.serialize(cmd.document, cmd.filename, cmd.format) +def write_file(cmd: commands.Write): + log.info(f"write {cmd=}") + return operations.write_provenance_file(cmd.document, cmd.filename, cmd.format) -def file2document(cmd: commands.File2Document): - return operations.deserialize(cmd.source, cmd.content, cmd.format) +def read_file(cmd: commands.Read): + log.info(f"read {cmd=}") + return operations.read_provenance_file(cmd.filename) def statistics(cmd: commands.Statistics): + log.info(f"statistics {cmd=}") return operations.stats(cmd.document, cmd.resolution, cmd.format) @@ -67,9 +77,9 @@ def statistics(cmd: commands.Statistics): fetch_githosted, ], commands.Serialize: [serialize], - commands.Document2File: [document2file], - 
commands.File2Document: [file2document], + commands.Read: [read_file], + commands.Write: [write_file], commands.Combine: [combine], - commands.Normalize: [normalize], + commands.Transform: [transform], commands.Statistics: [statistics], } From dbd614e00da65c59def312f3d3639e0f9ffb9a12 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:37:50 +0100 Subject: [PATCH 50/81] Regrouped commands in 'transform' command (#83) --- gitlab2prov/entrypoints/cli.py | 158 +++++++++++++++------------------ 1 file changed, 74 insertions(+), 84 deletions(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 08d2eaa..6763401 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -1,4 +1,3 @@ -import sys from functools import partial from functools import update_wrapper from functools import wraps @@ -50,7 +49,8 @@ def validate_config(ctx: click.Context, _, filepath: str): def processor(func, wrapped=None): """Decorator that turns a function into a processor. - A processor is a function that takes a stream of values, applies an operation to each value and returns a new stream of values. + A processor is a function that takes a stream of values, applies an operation + to each value and returns a new stream of values. A processor therefore transforms a stream of values into a new stream of values. """ @@ -68,7 +68,8 @@ def generator(func): """Decorator that turns a function into a generator. A generator is a special case of a processor. - A generator is a processor that doesn't apply any operation to the values but adds new values to the stream. + A generator is a processor that doesn't apply any operation + to the values but adds new values to the stream. """ @partial(processor, wrapped=func) @@ -146,8 +147,9 @@ def process_commands(processors, **kwargs): """Execute the chain of commands. This function is called after all subcommands have been chained together. 
- It executes the chain of commands by piping the output of one command into the input of the next command. - Subcommands can be processors that transform the stream of values or generators that add new values to the stream. + It executes the chain of commands by piping the output of one command + into the input of the next command. Subcommands can be processors that transform + the stream of values or generators that add new values to the stream. """ # Start with an empty iterable. stream = () @@ -161,7 +163,7 @@ def process_commands(processors, **kwargs): pass -@click.command("extract") +@click.command() @click.option( "-u", "--url", "urls", multiple=True, type=str, required=True, help="Project url[s]." ) @@ -179,7 +181,7 @@ def extract(bus, urls: list[str], token: str): for url in urls: doc = bus.handle(commands.Fetch(url, token)) doc = bus.handle(commands.Serialize(url)) - doc = bus.handle(commands.Normalize(doc)) + doc = bus.handle(commands.Transform(doc)) if not document: document = doc document.update(doc) @@ -188,33 +190,34 @@ def extract(bus, urls: list[str], token: str): yield document -@click.command("load", short_help="Load provenance files.") +@click.command() @click.option( "-i", - "--input", - "sources", + "--from", + "filenames", + default=["-"], multiple=True, type=click.Path(dir_okay=False), help="Provenance file path (specify '-' to read from <stdin>).", ) @click.pass_obj @generator -def load(bus, sources: list[str]): - """Load provenance information from a file. +def read(bus, filenames: list[str]): + """Read provenance information from file[s]. - This command reads one provenance graph from a file or multiple graphs from multiple files. + This command reads one provenance graph from a file/stdin or + multiple graphs from multiple files. 
""" - for filepath in sources: + for filename in filenames: try: - filename = sys.stdin if filepath == "-" else filepath - document = bus.handle(commands.File2Document(filename)) - document.description = "'<stdin>'" if filepath == "-" else f"'{filepath}'" + document = bus.handle(commands.Read(filename=filename)) + document.description = "'<stdin>'" if filename == "-" else f"'{filename}'" yield document except Exception as e: - click.echo(f"Could not open '{filepath}': {e}", err=True) + click.echo(f"Could not open '{filename}': {e}", err=True) -@click.command("save") +@click.command() @click.option( "-f", "--format", @@ -226,16 +229,15 @@ def load(bus, sources: list[str]): ) @click.option( "-o", - "--output", + "--to", "destination", default="-", - # TODO: think of a better default help="Output file path.", ) @processor @click.pass_obj -def save(bus, documents, formats, destination): - """Save one or multiple provenance documents to a file. +def write(bus, documents, formats, destination): + """Write provenance information to file[s]. This command saves one or multiple provenance documents to a file. @@ -243,42 +245,50 @@ def save(bus, documents, formats, destination): The serialization format can be specified using the '-f' option. 
""" documents = list(documents) - + for i, document in enumerate(documents, start=1): - + for fmt in formats: filename = f"{destination}{'-' + str(i) if len(documents) > 1 else ''}.{fmt}" try: - bus.handle(commands.Document2File(document, filename, fmt)) + bus.handle(commands.Write(document, filename, fmt)) except Exception as exc: click.echo(f"Could not save {document.description}: {exc}", err=True) yield document -@click.command("pseudonymize") +@click.command() +@click.option("--use-pseudonyms", is_flag=True, help="Use pseudonyms.") +@click.option("--remove-duplicates", is_flag=True, help="Remove duplicate statements.") +@click.option( + "--merge-aliased-agents", + type=click.Path(exists=True), + default="", + help="Merge aliased agents.", +) @processor @click.pass_obj -def pseudonymize(bus, documents: Iterator[ProvDocument]): - """Pseudonymize a provenance document. - - This command pseudonymizes one or multiple provenance documents. - - Pseudonymization is done by hashing attributes that contain personal information. - Pseudonymization only affects agents and their attributes. +def transform( + bus, + documents: Iterator[ProvDocument], + use_pseudonyms: bool = False, + remove_duplicates: bool = False, + merge_aliased_agents: str = "", +): + """Apply a set of transformations to provenance documents. + + This command applies a set of transformations to one or multiple provenance documents. 
""" for document in documents: + transformed = bus.handle( + commands.Transform(document, use_pseudonyms, remove_duplicates, merge_aliased_agents) + ) + transformed.description = f"normalized {document.description}" + yield transformed - try: - document = bus.handle(commands.Normalize(document, use_pseudonyms=True)) - document.description = f"pseudonymized {document.description}" - yield document - except Exception as exc: - click.echo(f"Could not pseudonymize {document.description}: {exc}", err=True) - - -@click.command("combine") +@click.command() @processor @click.pass_obj def combine(bus, documents: Iterator[ProvDocument]): @@ -291,7 +301,7 @@ def combine(bus, documents: Iterator[ProvDocument]): try: document = bus.handle(commands.Combine(documents)) - document = bus.handle(commands.Normalize(document)) + document = bus.handle(commands.Transform(document)) document.description = f"combination of {', '.join(descriptions)}" yield document @@ -299,30 +309,32 @@ def combine(bus, documents: Iterator[ProvDocument]): click.echo(f"Could not combine {', '.join(descriptions)}: {exc}", err=True) -@click.command("stats") +@click.command() @click.option( "--coarse", "resolution", flag_value="coarse", default=True, - help="Print the number of PROV elements aswell as the overall number of relations.", + help="Print the number of PROV elements for each element type.", ) @click.option( "--fine", "resolution", flag_value="fine", - help="Print the number of PROV elements aswell as the number of PROV relations for each relation type.", + help="Print the number of PROV elements for each element type and each relation type.", ) @click.option("--format", type=click.Choice(["csv", "table"]), default="table") @click.option( - "--explain", + "--verbose", is_flag=True, help="Print a textual summary of all operations applied to the graphs.", ) @processor @click.pass_obj -def stats(bus, documents: Iterator[ProvDocument], resolution: str, format: str, explain: bool): - """Print statistics 
such as node counts and relation counts. +def statistics( + bus, documents: Iterator[ProvDocument], resolution: str, format: str, verbose: bool +): + """Print statistics for one or more provenance documents. This command prints statistics for each processed provenance graph. Statistics include the number of elements for each element type aswell as the number of relations for each relation type. @@ -331,48 +343,26 @@ def stats(bus, documents: Iterator[ProvDocument], resolution: str, format: str, for document in documents: try: statistics = bus.handle(commands.Statistics(document, resolution, format)) - if explain: + if verbose: statistics = f"{document.description}\n\n{statistics}" click.echo(statistics) - except: + except Exception: click.echo("Could not compute statistics for {document.description}.", err=True) yield document -@click.command() -@click.option( - "--mapping", - "path_to_agent_map", - type=click.Path(exists=True, dir_okay=False), - help="File path to duplicate agent mapping.", -) -@processor -@click.pass_obj -def merge_duplicated_agents(bus, documents: Iterator[ProvDocument], path_to_agent_map: str): - """Merge duplicated agents based on a name to aliases mapping. - - This command solves the problem of duplicated agents that can occur when the same physical user - uses different user names and emails for his git and gitlab account. - Based on a mapping of names to aliases the duplicated agents can be merged. 
- """ - for document in documents: - document = bus.handle(commands.Normalize(document, agent_mapping=path_to_agent_map)) - document.description += f"merged double agents {document.description}" - yield document - - +# CLI group for gitlab commands gitlab_cli.add_command(extract) -gitlab_cli.add_command(stats) +gitlab_cli.add_command(read) +gitlab_cli.add_command(write) gitlab_cli.add_command(combine) -gitlab_cli.add_command(pseudonymize) -gitlab_cli.add_command(save) -gitlab_cli.add_command(load) -gitlab_cli.add_command(merge_duplicated_agents) +gitlab_cli.add_command(transform) +gitlab_cli.add_command(statistics) +# CLI group for github commands github_cli.add_command(extract) -github_cli.add_command(stats) +github_cli.add_command(read) +github_cli.add_command(write) github_cli.add_command(combine) -github_cli.add_command(pseudonymize) -github_cli.add_command(save) -github_cli.add_command(load) -github_cli.add_command(merge_duplicated_agents) +github_cli.add_command(transform) +github_cli.add_command(statistics) From 796c5e07b93110f26d27dc0b837bc14039f2ad79 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:51:00 +0100 Subject: [PATCH 51/81] Fix read/write option spelling --- gitlab2prov/entrypoints/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 6763401..9d1a799 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -193,7 +193,7 @@ def extract(bus, urls: list[str], token: str): @click.command() @click.option( "-i", - "--from", + "--input", "filenames", default=["-"], multiple=True, @@ -229,7 +229,7 @@ def read(bus, filenames: list[str]): ) @click.option( "-o", - "--to", + "--output", "destination", default="-", help="Output file path.", From b4c744d4ed6a082bd087a20594fd774a30b2221a Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:51:43 +0100 Subject: [PATCH 52/81] Update 
example configuration file --- config/example.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/config/example.yaml b/config/example.yaml index 6e78e6a..017c570 100644 --- a/config/example.yaml +++ b/config/example.yaml @@ -5,11 +5,13 @@ - extract: url: ["https://gitlab.com/example/bar"] token: tokenBar -- load: +- read: input: [example.rdf] -- pseudonymize: - combine: -- save: +- transform: + use_pseudonyms: true + remove_duplicates: true +- write: output: combined format: [json, rdf, xml, dot] - stats: From 55de7a9c09902c9b50c98234022cda6587610959 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 18:52:08 +0100 Subject: [PATCH 53/81] Fix 'stats' spelling --- gitlab2prov/entrypoints/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 9d1a799..28865e2 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -309,7 +309,7 @@ def combine(bus, documents: Iterator[ProvDocument]): click.echo(f"Could not combine {', '.join(descriptions)}: {exc}", err=True) -@click.command() +@click.command("stats") @click.option( "--coarse", "resolution", From 0baac8ee76c53c08374a44e60c16ca4f9594c5db Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 19:03:02 +0100 Subject: [PATCH 54/81] Add transform command to schema --- gitlab2prov/config/schema.json | 64 ++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/gitlab2prov/config/schema.json b/gitlab2prov/config/schema.json index 6716945..f39e719 100644 --- a/gitlab2prov/config/schema.json +++ b/gitlab2prov/config/schema.json @@ -7,19 +7,19 @@ "$ref": "#/definitions/extract" }, { - "$ref": "#/definitions/load" + "$ref": "#/definitions/read" }, { "$ref": "#/definitions/combine" }, { - "$ref": "#/definitions/save" + "$ref": "#/definitions/write" }, { - "$ref": "#/definitions/pseudonymize" + 
"$ref": "#/definitions/stats" }, { - "$ref": "#/definitions/stats" + "$ref": "#/definitions/transform" } ] }, @@ -53,10 +53,10 @@ "extract" ] }, - "load": { + "read": { "type": "object", "properties": { - "load": { + "read": { "type": "object", "properties": { "input": { @@ -74,7 +74,7 @@ }, "additionalProperties": false, "required": [ - "load" + "read" ] }, "combine": { @@ -89,22 +89,10 @@ "combine" ] }, - "pseudonymize": { + "write": { "type": "object", "properties": { - "pseudonymize": { - "type": "null" - } - }, - "additionalProperties": false, - "required": [ - "pseudonymize" - ] - }, - "save": { - "type": "object", - "properties": { - "save": { + "write": { "type": "object", "properties": { "output": { @@ -114,7 +102,13 @@ "type": "array", "items": { "type": "string", - "enum": ["json", "rdf", "provn", "dot", "xml"] + "enum": [ + "json", + "rdf", + "provn", + "dot", + "xml" + ] } } }, @@ -127,7 +121,7 @@ }, "additionalProperties": false, "required": [ - "save" + "write" ] }, "stats": { @@ -160,6 +154,30 @@ "required": [ "stats" ] + }, + "transform": { + "type": "object", + "properties": { + "transform": { + "type": "object", + "properties": { + "use_pseudonyms": { + "type": "boolean" + }, + "remove_duplicates": { + "type": "boolean" + }, + "merge_aliased_agents": { + "type": "boolean" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false, + "required": [ + "transform" + ] } } } \ No newline at end of file From b243965cd9da469e1d6f3b63dae3cc3873da2054 Mon Sep 17 00:00:00 2001 From: cdboer <cdboer@cdboer.de> Date: Sun, 26 Feb 2023 19:03:38 +0100 Subject: [PATCH 55/81] Update README.md --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 66bf374..f398b88 100644 --- a/README.md +++ b/README.md @@ -84,13 +84,12 @@ Options: --help Show this message and exit. Commands: - combine Combine multiple graphs into one. 
- extract Extract provenance information for one or more... - load Load provenance files. - merge-duplicated-agents Merge duplicated agents based on a name to... - pseudonymize Pseudonymize a provenance graph. - save Save provenance information to a file. - stats Print statistics such as node counts and... + combine Combine one or more provenance documents. + extract Extract provenance information for one or more gitlab... + read Read provenance information from file[s]. + stats Print statistics for one or more provenance documents. + transform Apply a set of transformations to provenance documents. + write Write provenance information to file[s]. ``` ### Configuration Files @@ -118,8 +117,10 @@ Config file example: token: tokenB - load: input: [example.rdf] -- pseudonymize: - combine: +- transform: + use_pseudonyms: true + remove_duplicates: true - save: output: combined format: [json, rdf, xml, dot] @@ -135,9 +136,9 @@ The config file example is functionally equivalent to this command line invocati gitlab2prov extract -u https://gitlab.com/example/foo -t tokenFoo \ extract -u https://gitlab.com/example/bar -t tokenBar \ load -i example.rdf \ - pseudonymize \ combine \ - save -o combined -f json -f rdf -f xml -f dot \ + transform --use-pseudonyms --remove-duplicates \ + write -o combined -f json -f rdf -f xml -f dot \ stats --fine --explain --formatter table ``` From f4c54396696905009c53f9a3adcf579b0d3efc0a Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 17 Apr 2023 11:23:53 +0200 Subject: [PATCH 56/81] Rewrite model explanations (#81) --- docs/README.md | 371 +++++++++++++++++++++++++------------------------ 1 file changed, 191 insertions(+), 180 deletions(-) diff --git a/docs/README.md b/docs/README.md index 52cb91b..9de0832 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,8 +10,7 @@ pip install -r requirements.txt ``` ## The GitLab2PROV Provenance Model -The provenance model for GitLab2PROV provenance graphs consists of multiple 
submodels, that are concerned with various types of interactions that users can have with a GitLab project aswell as with the `git` repository contained within the project. -A few models have been compiled without prior examples others are derivations of related projects such as `git2prov` or `github2prov`. +The GitLab2PROV provenance model comprises several submodels that address different user interactions with a GitLab project and its corresponding git repository. Some submodels have been developed without existing examples, while others are adaptations of related projects like [`git2prov`](https://github.com/IDLabResearch/Git2PROV) or [`github2prov`](https://www.usenix.org/system/files/tapp2019-paper-packer.pdf). In total, the GitLab2PROV provenance model includes the following submodels: @@ -23,12 +22,11 @@ In total, the GitLab2PROV provenance model includes the following submodels: 6. **GitLab: Merge Request Web Resource** 7. **GitLab: Release & Tag Resource** -This document contains a brief explanation for each model. -This includes, but is not limited to, a reference table for each PROV element of a model that defines which attributes are attached to the element. -Reference tables for qualified relations, i.e. relations that with attached attributes, are also provided. +This document provides a concise explanation for each model, including a reference table for each PROV element that defines the attributes attached to the element. +The reference tables for qualified relations, which are relations with attached attributes, are also included. -This document uses the Cypher query language notation to denote relationships/relations. -The following ASCII art based notation represents a directed relation `r` of type `R` between the vertices `S` and `T`. +To represent relationships/relations, this document uses the Cypher query language notation. 
+The notation consists of an ASCII art-based representation of a directed relation `r` of type `R` between vertices `S` and `T`. `(S)-[r:R]->(T)` @@ -36,52 +34,61 @@ The following ASCII art based notation represents a directed relation `r` of typ ![Addition of a File.](./svgs/git_commit_model_add.svg) -This model captures the addition of a new file to the git repository of a GitLab project by a git commit. +This model captures the addition of a new file to a GitLab project's git repository by a git commit. -The model includes all human actors involved in the process. -In this case these actors are the author and the committer of the git commit represented as agents in the model. -The author represents the user that originally wrote the code contained in the commit. -The committer represents the user that committed the code on behalf of the author. -Committer and author can be the same person but do not have to be. +All human actors involved in the process are included in the model. +These actors are represented as agents in the model. +The author represents the user that originally wrote the code contained in the commit. +The committer represents the user that committed the code on behalf of the author. +The committer and author can be the same person but do not have to be. -The commit aswell as all of its parents are captured as activities. -Each commit is said to be informed by its parent commit, as each commit builds upon the git repository that the parent commits left behind. -The commit is associated to both author and committer as these are the actors responsible for its existance. +The commit and all of its parents are captured as activities. +Each commit is informed by its parent commit. +The commit is associated with both the author and committer as these are the actors responsible for its existence. -Two entities are created for the file that was added in the commit. 
-One, the File entity, represents the origin of the added file aswell as the concept of its originality. -The second entity, called FileRevision, represents the added file at the time of its addition. -The revision are marked as a specialization of the file origin. -Both entities are generated by the commit activity. -Both entities are attributed to the author of the commit, the actor responsible for their content and creation. +Two entities are created for the added file. +The File entity represents the origin of the added file as well as the concept of its originality. +The second entity, called FileRevision, represents the added file at the time of its addition. +The revisions are marked as specializations of the file origin. +Both entities are generated by the commit activity and are attributed to the author of the commit, the actor responsible for their content and creation. **`Author`** -| Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | -| name | - | `git config user.name` Set in the author's git config. | -| email | - | `git config user.email` Set in the author's git config. | -| prov:role | Author | Function of the agent in context of the commit activity. | -| prov:type | User | Agent type. | -| prov:label | - | Human readable representation of the agent. | +| Attribute | Fixed Value | Description | +| --------------- | ----------- | -------------------------------------------------------- | +| name | - | `git config user.name` Set in the author's git config. | +| email | - | `git config user.email` Set in the author's git config. | +| gitlab_username | - | Gitlab user account username. | +| github_username | - | Github user account username. | +| gitlab_email | - | Gitlab user account email. | +| github_email | - | Github user account email. | +| prov:role | Author | Function of the agent in context of the commit activity. | +| prov:type | User | Agent type. 
| +| prov:label | - | Human readable representation of the agent. | **`Committer`** -| Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | -| name | - | `git config user.name` Set in the author's git config. | -| email | - | `git config user.email` Set in the author's git config. | -| prov:role | Committer | Function of the agent in context of the commit activity. | -| prov:type | User | Agent type. | -| prov:label | - | Human readable representation of the agent. | +| Attribute | Fixed Value | Description | +| --------------- | ----------- | -------------------------------------------------------- | +| name | - | `git config user.name` Set in the author's git config. | +| email | - | `git config user.email` Set in the author's git config. | +| gitlab_username | - | Gitlab user account username. | +| github_username | - | Github user account username. | +| gitlab_email | - | Gitlab user account email. | +| github_email | - | Github user account email. | +| prov:role | Committer | Function of the agent in context of the commit activity. | +| prov:type | User | Agent type. | +| prov:label | - | Human readable representation of the agent. | **`Commit`** | Attribute | Fixed Value | Description | | -------------- | ----------------------- | ------------------------------------------- | -| hexsha | - | Commit SHA1 | -| message | - | Commit message. | +| sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | +| message | - | Commit message. | +| authored_at | - | Time at which the commit was authored. | +| committed_at | - | Time at which the commit was committed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the commit activity ended. | | prov:type | GitCommit | Activity type. 
| @@ -89,22 +96,25 @@ Both entities are attributed to the author of the commit, the actor responsible **`File`** -| Attribute | Fixed Value | Description | -| ------------ | ----------- | ------------------------------------------------------------------ | -| path | - | Original file path. The path at which this file was first created. | -| committed_in | - | SHA1 of the commit that added this file to the repository. | -| prov:type | File | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ----------- | ------------------------------------------------------------------ | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| prov:type | File | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`File Revision`** -| Attribute | Fixed Value | Description | -| ------------ | ------------ | ---------------------------------------------------------------------------- | -| path | - | Current file path of this revision. | -| committed_in | - | SHA1 of the commit that added this revision to the repository. | -| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | -| prov:type | FileRevision | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ----------- | ---------------------------------- | ---------------------------------------------------------------------------- | +| name | - | Current file name. | +| path | - | Current file path of this revision. | +| commit | - | SHA1 of the commit that added this revision to the repository. | +| status | `added` or `modified` or `deleted` | Change status of the file revision. 
| +| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | +| prov:type | FileRevision | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -142,26 +152,25 @@ The following tables define the attributes attached to these relations. ![Modifying a file.](./svgs/git_commit_model_mod.svg) -This model captures the modification of a file from the git repository of a GitLab project by a git commit. +This model captures the modification of a file in a GitLab project's git repository by a git commit. -The model includes all human actors involved in the process. -In this case these actors are the author and the committer of the git commit represented as agents in the model. -The author represents the user that originally wrote the code contained in the commit. -The committer represents the user that committed the code on behalf of the author. -Committer and author can be the same person but do not have to be. +All human actors involved in the process are included in the model. +These actors are represented as agents in the model. +The author represents the user that originally wrote the code contained in the commit. +The committer represents the user that committed the code on behalf of the author. +The committer and author can be the same person but do not have to be. -The commit aswell as all of its parents are captured as activities. -Each commit is said to be informed by its parent commit, as each commit builds upon the git repository that the parent commits left behind. -The commit is associated to both author and committer as these are the actors responsible for its existance. +The commit and all of its parents are captured as activities. +Each commit is informed by its parent commit. +The commit is associated with both the author and committer as these are the actors responsible for its existence. 
The commit uses the PreviousFileRevision to generate a new revision that accounts for the modifications included in the commit. -Three entities are created for the modified file. -One, the File entity, represents the origin of the file aswell as the concept of its originality. -The File entity will already exist, due to it being created in the 'Git: Addition of a File' model. -The second entity, called PreviousFileRevision, represents the latest file revision of the modified file before modification. -The third entity, called FileRevision, represents the revision of the file after the modification has been accounted for. -The FileRevision is said to be derived from the previous revision of the modified file. -The FileRevision is generated by the commit activity. +Three entities are created for the modified file. +The File entity represents the origin of the file as well as the concept of its originality. +The File entity already exists due to it being created in the 'Git: Addition of a File' model. +The second entity, called PreviousFileRevision, represents the latest file revision of the modified file before modification. +The third entity, called FileRevision, represents the revision of the file after the modification has been accounted for. +The FileRevision is derived from the previous revision of the modified file and is generated by the commit activity. All revisions are marked as specializations of the File entity. @@ -260,23 +269,21 @@ The following tables define the attributes attached to these relations. ![Deleting a file.](./svgs/git_commit_model_del.svg) -This model captures the deletion of a file from the git repository of a GitLab project by a git commit. +This model documents the removal of a file from the git repository of a GitLab project through a git commit. -The model includes all human actors involved in the process. -In this case these actors are the author and the committer of the git commit represented as agents in the model. 
-The author represents the user that originally wrote the code contained in the commit. -The committer represents the user that committed the code on behalf of the author. -Committer and author can be the same person but do not have to be. +The model accounts for all human actors involved in the process. +These actors are represented as agents in the model and include the author and the committer of the git commit. +The author represents the user who originally wrote the code contained in the commit, while the committer represents the user who committed the code on behalf of the author. +It is possible for the committer and author to be the same person. -The commit aswell as all of its parents are captured as activities. -Each commit is said to be informed by its parent commit, as each commit builds upon the git repository that the parent commits left behind. -The commit is associated to both author and committer, as these are the actors responsible for its existance. +The commit, as well as all of its parents, are captured as activities. +Each commit builds upon the git repository that the parent commits left behind and is informed by its parent commit. +The commit is attributed to both author and committer, as these are the actors responsible for its existence. -Two entities are created for the deleted file. -One, the File entity, represents the origin of the file aswell as the concept of its originality. -The second, the FileRevision entity, represents the revision of the file at the point of its deletion. -The revision is invalidated by the commit that deletes / removes it from the repository. -The deleted revision is marked as a specialization of the original File entity. +The model generates two entities for the deleted file. +The File entity represents the origin of the file and the concept of its originality. +The second entity, FileRevision, represents the revision of the file at the time of its deletion. 
+The deleted revision is invalidated by the commit that removes it from the repository and is marked as a specialization of the original File entity. **`Author`** @@ -358,37 +365,36 @@ The following tables define the attributes attached to these relations. ![GitLab Commit Model](./svgs/gitlab_commit_model.svg) -This model captures the creation and annotation of a GitLab commit web resource, i.e. the webpage of a git commit as displayed by GitLab. - -GitLab creates a webpage for a commit as soon as the commit is pushed to the GitLab remote. -Users can interact with the webpage by, among other interactions, leaving a comment in the comment section. -GitLab captures some of these interactions and stores them in internal data structures. -Comments written by users are therefore retrievable through the GitLab API. -Retrievable interactions such as comments are considered to "annotate" the web resource. - -The model includes all human actors involved in the process of the creation or annotation of a GitLab commit web resource. -In this case these actors are the author of the GitLab commit web resource aswell as all users responsible for annotations. -The author represents the user that pushed the commit to the GitLab remote and consequently triggered the creation of the web resource. -An annotator is a user that is responsible for the existance of an annotation such as a comment. -In case of the annotation being a comment, the responsible annotator would be the author of the comment. - -The creation of the web resource is captured as an activity. -The creation activity is informed by the corresponding git commit that triggered the creation of the commit web resource. -The creation activity is associated with the user that pushed the commit to the GitLab remote. -In the context of the creation activity, this user is called "Gitlab Commit Author". -Each annotation is captured as an activity that uses the latest version of the web resource to generate a new one. 
-Each annotation is associated with the user that is responsible for creating it. -Each annotation is informed by either the annotation that precedes it or - if no annotations have been recorded so far - the creation activity. -The annotations form a chain of events, that corresponds to the chain of interactions between users and the GitLab commit web resource. - -The commit web resource is captured by multiple entities. -One for the original web resource and its concept of originality called "GitLab Commit". -A second one for the version of the GitLab commit web resource at the time of its creation, called "Commit Version". -One entity per annotation capturing the commit web resource right after the annotation happened, called "Annotated Commit Version". -The original web resource and the resource version at the point of creation is generated by the creation activity. -The original web resource and the first resource version are attributed to the gitlab commit author. -Each annotated commit version is generated by the corresponding annotation activity. -Each annotated commit version is attributed to its annotator. +This model focuses on capturing the creation and annotation of a GitLab commit web resource, which refers to the webpage of a Git commit as displayed by GitLab. + +Upon pushing a commit to the GitLab remote, GitLab automatically generates a webpage for the commit. +Users can interact with this webpage by leaving comments in the comment section and engaging in other interactions. +GitLab captures and stores some of these interactions in internal data structures. +As a result, comments written by users and other stored interactions can be retrieved through the GitLab API. +In this model, these retrievable interactions, such as comments, are considered as annotations to the web resource. + +The model encompasses all human actors involved in the process of creating or annotating a GitLab commit web resource. 
+This includes the author of the GitLab commit web resource, who is the user that pushed the commit to the GitLab remote, triggering the creation of the web resource. +Additionally, it includes all users who are responsible for annotations, such as comments. + +An annotator is a user who is responsible for the existence of an annotation, such as a comment, on the GitLab commit web resource. +For instance, in the case of a comment being an annotation, the responsible annotator would be the author of that comment. +The model accounts for all these actors, providing a comprehensive representation of the human involvement in the creation and annotation of the GitLab commit web resource. + +The creation of the GitLab commit web resource is captured as an activity, which is informed by the corresponding git commit that triggered the creation. The user who pushed the commit to the GitLab remote is associated with the creation activity and referred to as the "GitLab Commit Author". + +Each annotation is represented as an activity that generates a new version of the web resource by using the latest version. +The user responsible for creating the annotation is associated with the annotation activity. +Annotations are informed by either the preceding annotation, if any, or the creation activity, if no annotations have been recorded yet. +This forms a chain of events, reflecting the chain of interactions between users and the GitLab commit web resource. + +The GitLab commit web resource is represented by multiple entities. Firstly, an entity for the original web resource and its concept of originality called "GitLab Commit". Secondly, an entity for the version of the GitLab commit web resource at the time of its creation, referred to as "Commit Version". For each annotation, a separate entity called "Annotated Commit Version" captures the state of the commit web resource after the annotation occurred. 
+ +The creation activity generates the original web resource entity and the Commit Version entity, representing the web resource at the point of creation. Both of these entities are attributed to the GitLab Commit Author, who pushed the commit to the GitLab remote. + +Each annotated commit version is generated by the corresponding annotation activity, capturing the web resource state after the annotation. +The Annotated Commit Version entity is attributed to its annotator, who is responsible for creating the annotation. +This way, the model captures the lineage of the GitLab commit web resource and associates it with the relevant users and activities throughout the process. **`Gitlab Commit Author`** @@ -527,34 +533,36 @@ The following tables define the attributes attached to these relations. ![GitLab Issue Model](./svgs/gitlab_issue_model.svg) -This model captures the creation and annotation of a GitLab issue web resource, i.e. the webpage of an issue as displayed by GitLab. +The GitLab: Issue Web Resource model represents the creation and annotation of a GitLab issue web resource, specifically the webpage of an issue as displayed in GitLab. + +The structure of the GitLab: Issue Web Resource model is similar to that of the GitLab: Commit Web Resource model. +GitLab's issue tracker can be accessed via the GitLab API just like the commit web resources. +Both models share similar concepts and ideas behind their design. + +The model encompasses all human actors involved in the creation or annotation process of a GitLab issue web resource. +These actors include the author of the GitLab issue web resource, as well as users responsible for annotations. -GitLab provides an issue tracker which is accessable through the GitLab API. -The GitLab: Issue Web Resource model is structurally similar to the GitLab: Commit Web Resource model. -The idea behind it is very similar aswell. +The issue author refers to the user who originally opened/created the issue. 
-The model includes all human actors involved in the process of the creation or annotation of a gitlab issue web resource. -In this case the actors are the author of the gitlab issue web resource aswell as all users responsible for annotations. -The issue author represents the user that opened/created the issue in the first place. -An annotator is a user that is responsible for the existance of an annotation such as a comment, label, etc. -For example: In case of the annotation being a comment, the responsible annotator would be the author of the comment. +An annotator is a user who is responsible for creating annotations, such as comments, labels, etc. +For instance, in the case of a comment annotation, the author of the comment is considered the responsible annotator. -The creation of the web resource is captured as an activity. -The creation activity is associated with the user that opened/created the issue. -In the context of the creation activity, this user is called "Issue Author". -Each annotation is captured as an activity that uses the latest version of the web resource to generate a new one. -Each annotation is associated with the user that is responsible for creating it. -Each annotation is informed by either the annotation that precedes it or - if no annotations have been recorded so far - the creation activity. -The annotations form a chain of events, that corresponds to the chain of interactions between users and the gitlab issue web resource. +The creation of the web resource is captured as an activity, which is associated with the user who opened/created the issue, referred to as the "Issue Author" within the context of the creation activity. -The issue web resource is captured by multiple entities. -One for the original web resource and its concept of originality called "Issue". -A second one for the version of the gitlab issue web resource at the time of its creation, called "Issue Version". 
-One entity per annotation capturing the issue web resource right after the annotation happened, called "Annotated Issue Version". -The original web resource and the resource version at the point of creation is generated by the creation activity. -The original web resource and the first resource version are attributed to the gitlab issue author. -Each annotated issue version is generated by the corresponding annotation activity. -Each annotated issue version is attributed to its annotator. +Each annotation is captured as an activity that uses the latest version of the web resource to generate a new version. +Each annotation is associated with the user responsible for creating it. + +Annotations are informed by either the preceding annotation, if any, or the creation activity if no annotations have been recorded yet. +This creates a chain of events that corresponds to the interactions between users and the GitLab issue web resource. + +The issue web resource is represented by multiple entities, including one for the original web resource referred to as "Issue" which captures its concept of originality. +Another entity is created for the version of the GitLab issue web resource at the time of its creation, called "Issue Version". +Additionally, one entity is created per annotation to capture the issue web resource right after the annotation has occurred, known as "Annotated Issue Version". + +Both the original web resource and the resource version at the point of creation are generated by the creation activity, and are attributed to the GitLab issue author. + +Each annotated issue version is generated by the corresponding annotation activity, and is attributed to its annotator. +This allows for capturing the changes in the issue web resource after each annotation activity has taken place. **`Issue Author`** @@ -687,34 +695,40 @@ The following tables define the attributes attached to these relations. 
![GitLab Merge Request Model](./svgs/gitlab_merge_request_model.svg) +The GitLab: Merge Request Web Resource model represents the creation and annotation of a GitLab merge request web resource, which refers to the webpage of a merge request as displayed in GitLab. + +The structure of the GitLab: Merge Request Web Resource model is similar to that of the GitLab: Commit Web Resource model. + +Both models share similar concepts and ideas behind their design. + This model captures the creation and annotation of a GitLab merge request web resource, i.e. the webpage of a merge request as displayed by GitLab. -The GitLab: Merge Request Web Resource model is structurally similar to the GitLab: Commit Web Resource model. -The idea behind it is very similar aswell. - -The model includes all human actors involved in the process of the creation or annotation of a gitlab merge request web resource. -In this case the actors are the author of the gitlab merge request web resource aswell as all users responsible for annotations. -The issue author represents the user that opened/created the merge request in the first place. -An annotator is a user that is responsible for the existance of an annotation such as a comment, label, etc. -For example: In case of the annotation being a comment, the responsible annotator would be the author of the comment. - -The creation of the web resource is captured as an activity. -The creation activity is associated with the user that opened/created the merge request. -In the context of the creation activity, this user is called "Merge Request Author". -Each annotation is captured as an activity that uses the latest version of the web resource to generate a new one. -Each annotation is associated with the user that is responsible for creating it. -Each annotation is informed by either the annotation that precedes it or - if no annotations have been recorded so far - the creation activity. 
-The annotations form a chain of events, that corresponds to the chain of interactions between users and the gitlab merge request web resource. - -The merge request web resource is captured by multiple entities. -One for the original web resource and its concept of originality called "Merge Request". -A second one for the version of the gitlab merge request web resource at the time of its creation, called "Merge Request Version". -One entity per annotation capturing the merge request web resource right after the annotation happened, called "Annotated Merge Request Version". -The original web resource and the resource version at the point of creation is generated by the creation activity. -The original web resource and the first resource version are attributed to the gitlab merge request author. -Each annotated merge request version is generated by the corresponding annotation activity. -Each annotated merge request version is attributed to its annotator. +The model encompasses all human actors involved in the creation or annotation process of a GitLab merge request web resource. +These actors include the author of the GitLab merge request web resource, as well as users responsible for annotations. + +The merge request author refers to the user who originally opened/created the merge request. + +An annotator is a user who is responsible for creating annotations, such as comments, labels, etc. +For instance, in the case of a comment annotation, the author of the comment is considered the responsible annotator. + +The creation of the merge request web resource is represented as an activity in the model. +The creation activity is associated with the user who opened/created the merge request, referred to as "Merge Request Author" in the context of the creation activity. + +Each annotation is captured as an activity that utilizes the latest version of the web resource to generate a new version. 
+Each annotation is associated with the user who is responsible for creating it. + +Annotations are informed by either the preceding annotation or, if no annotations have been recorded yet, the creation activity itself. +These annotations collectively form a chain of events that corresponds to the interactions between users and the GitLab merge request web resource. + +The GitLab merge request web resource is represented by multiple entities in the model. +One entity represents the original web resource and its concept of originality, referred to as "Merge Request". +A second entity represents the version of the GitLab merge request web resource at the time of its creation, called "Merge Request Version". + +For each annotation, a separate entity is created to capture the merge request web resource right after the annotation occurred, called "Annotated Merge Request Version". + +The original web resource and the resource version at the point of creation are generated by the creation activity, and are attributed to the GitLab merge request author. +Each annotated merge request version is generated by the corresponding annotation activity, and is attributed to its annotator. **`Merge Request Author`** | Attribute | Fixed Value | Description | @@ -1033,32 +1047,29 @@ The following tables define the attributes attached to these relations. ## Annotations -GitLab displays annotations that occur on resources on the webpages of the respective resources. -For example, if a resource was mentioned in the comment thread of another resource, this mention is displayed in the comment section of the mentioned target. +GitLab allows annotations or comments to be displayed on the webpages of respective resources. +For example, if a resource (such as an issue or merge request) is mentioned in the comment thread of another resource, GitLab displays that mention in the comment section of the mentioned target. 
+This allows for discussions, references, and annotations to be visible and accessible within the context of the related resources, making it easier for users to collaborate and track discussions on GitLab webpages. ![comment thread](issue-thread.png) -These annotations can be parsed from multiple sources that are provided by the official GitLab API. -Sadly there is no dedicated endpoint for all annotations that are of interest. -Especially annotations that connect resources are difficult to get. -Here a quick summary of what data needs to be retrieved, how to parse it and the workarounds that we deployed to achieve annotation parsing. +Annotations can be parsed from various sources provided by the official GitLab API. +However, there is no dedicated endpoint for retrieving all annotations of interest, particularly those that connect resources, which can be challenging to obtain. +Here is a brief summary of the data that needs to be retrieved, how to parse it, and the workarounds that have been deployed to achieve annotation parsing. -For label events we use the official API endpoint from which we parse the appropriate annotations ("add_label", "remove_label"). -Emoji awards can be retrieved from the appropriate API endpoint. -We parse everything else - such as mentions, time tracking stats, due dates, TODO's, etc. - from system notes that GitLab uses to display annotations in their web-interface. +To retrieve label events, we use the official API endpoint and parse the relevant annotations such as "add_label" and "remove_label". +Emoji awards can be retrieved from the appropriate API endpoint. +For other annotations such as mentions, time tracking stats, due dates, TODO's, etc., we parse them from system notes that GitLab uses to display annotations in their web interface. -System notes include a string that describe the annotation that they represent. -We classify the annotation that the string denotes using regular expressions. 
-If necessary we include named groups in the regular expressions to extract relevant information from the annotation strings. -These are later added to PROV element attributes. +System notes contain a string that describes the annotation they represent. +We classify the annotation based on the string using regular expressions, and use named groups in the regular expressions to extract relevant information from the annotation strings. +This extracted information is later added to PROV element attributes. -Noted, this is not optimal as older GitLab versions employ different string notations for the same annotation. -Sometimes only differing by a few characters and other times having a completly different string for the same annotation. -In addition there is a problem when parsing imported projects. -For example, while parsing a project that was imported from SVN, relevant annotations wheren't recorded as system notes but rather as normal notes. -This is not accounted for and is - as of right now - not covered by the current note parsing approach. +Note that this approach may not be optimal as older GitLab versions may employ different string notations for the same annotation, sometimes differing by only a few characters or even having completely different strings for the same annotation. +Additionally, there may be issues when parsing imported projects, where relevant annotations may not be recorded as system notes but rather as normal notes. +This is not accounted for in the current note parsing approach. -Here a list of annotations that we are currently able to parse with a short description of what the annotation is and the API resource from which we parse that annotation. +Here is a list of annotations that we are currently able to parse, along with a short description of what the annotation is and the API resource from which we parse that annotation. 
### List of Annotations From e2ed0eada0438133440b232c490ac457596d6aff Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 17 Apr 2023 11:32:30 +0200 Subject: [PATCH 57/81] Remove old test suite --- tests/integration/test_repository.py | 38 -- tests/random_refs.py | 17 - tests/unit/test_annotation_parsing.py | 66 -- tests/unit/test_classifiers.py | 87 --- tests/unit/test_fetch_utils.py | 25 - tests/unit/test_handlers.py | 94 --- tests/unit/test_objects.py | 869 -------------------------- tests/unit/test_operations.py | 225 ------- 8 files changed, 1421 deletions(-) delete mode 100644 tests/integration/test_repository.py delete mode 100644 tests/random_refs.py delete mode 100644 tests/unit/test_annotation_parsing.py delete mode 100644 tests/unit/test_classifiers.py delete mode 100644 tests/unit/test_fetch_utils.py delete mode 100644 tests/unit/test_handlers.py delete mode 100644 tests/unit/test_objects.py delete mode 100644 tests/unit/test_operations.py diff --git a/tests/integration/test_repository.py b/tests/integration/test_repository.py deleted file mode 100644 index c82e221..0000000 --- a/tests/integration/test_repository.py +++ /dev/null @@ -1,38 +0,0 @@ -from datetime import datetime, timedelta - -from gitlab2prov.adapters import repository -from gitlab2prov.domain import objects - - -today = datetime.now() -tomorrow = today + timedelta(days=1) -yesterday = today - timedelta(days=1) - - -class TestInMemoryRepository: - def test_get(self): - repo = repository.InMemoryRepository() - u1 = objects.User(name="u1", email="e1", prov_role="r1") - u2 = objects.User(name="u2", email="e2", prov_role="r2") - repo.add(u1) - repo.add(u2) - assert repo.get(objects.User, name="u1") == u1 - assert repo.get(objects.User, name="u2") == u2 - - def test_get_returns_none_if_repository_is_empty(self): - repo = repository.InMemoryRepository() - assert repo.get(objects.User, name="name") == None - - def test_list_all(self): - repo = repository.InMemoryRepository() - u1 = 
objects.User(name="u1", email="e1", prov_role="r1") - u2 = objects.User(name="u2", email="e2", prov_role="r1") - repo.add(u1) - repo.add(u2) - assert repo.list_all(objects.User, name="u1") == [u1] - assert repo.list_all(objects.User, name="u2") == [u2] - assert repo.list_all(objects.User, prov_role="r1") == [u1, u2] - - def test_list_all_returns_empty_list_if_repository_is_empty(self): - repo = repository.InMemoryRepository() - assert repo.list_all(objects.User, name="name") == [] diff --git a/tests/random_refs.py b/tests/random_refs.py deleted file mode 100644 index 173759f..0000000 --- a/tests/random_refs.py +++ /dev/null @@ -1,17 +0,0 @@ -import uuid -from gitlab2prov.domain import objects -from gitlab2prov.domain.constants import ProvRole - - -def random_suffix(): - return uuid.uuid4().hex[:6] - - -def random_user(): - return objects.User( - name=f"user-name-{random_suffix()}", - email=f"user-email-{random_suffix()}", - gitlab_username=f"gitlab-user-name-{random_suffix()}", - gitlab_id=f"gitlab-user-id-{random_suffix()}", - prov_role=ProvRole.AUTHOR, - ) diff --git a/tests/unit/test_annotation_parsing.py b/tests/unit/test_annotation_parsing.py deleted file mode 100644 index 3e8a841..0000000 --- a/tests/unit/test_annotation_parsing.py +++ /dev/null @@ -1,66 +0,0 @@ -from gitlab2prov.adapters.fetch.annotations import CLASSIFIERS -from gitlab2prov.adapters.fetch.annotations.parse import classify_system_note -from gitlab2prov.adapters.fetch.annotations.parse import longest_matching_classifier -from gitlab2prov.adapters.fetch.annotations.parse import normalize - - -class TestNormalize: - def test_removes_trailing_whitespace(self): - string = " test " - assert not normalize(string).startswith(" ") - assert not normalize(string).endswith(" ") - - def test_lowercase(self): - string = "TEST" - assert normalize(string).islower() - - -class TestLongestMatchingClassifier: - def test_returns_classifier_with_the_longest_match(self): - string = "changed epic to slug&123" - 
assert longest_matching_classifier(string) is CLASSIFIERS[1] - assert longest_matching_classifier(string).name == "change_epic" - string = "close via merge request slug!123" - assert longest_matching_classifier(string) is CLASSIFIERS[7] - assert longest_matching_classifier(string).name == "close_by_external_merge_request" - string = "enabled automatic add to merge train when the pipeline for 12345abcde succeeds" - assert longest_matching_classifier(string) is CLASSIFIERS[-1] - assert longest_matching_classifier(string).name == "enable_automatic_add_to_merge_train" - - def test_returns_none_if_no_match_was_found(self): - string = "NOT_MATCHABLE" - assert longest_matching_classifier(string) is None - - -class TestClassifySystemNote: - def test_returns_import_statement_capture_groups(self): - expected_captures = {"pre_import_author": "original-author"} - string = "*by original-author on 1970-01-01T00:00:00 (imported from gitlab project)*" - assert classify_system_note(string)[1] == expected_captures - string = "*by original-author on 1970-01-01 00:00:00 UTC (imported from gitlab project)*" - assert classify_system_note(string)[1] == expected_captures - - def test_returns_annotation_classifier_capture_groups(self): - string = "assigned to @developer" - expected_captures = {"user_name": "developer"} - assert classify_system_note(string)[1] == expected_captures - - def test_returns_combined_capture_groups_of_the_import_statement_and_the_classifier( - self, - ): - string = "assigned to @developer *by original-author on 1970-01-01T00:00:00 (imported from gitlab project)*" - expected_captures = { - "user_name": "developer", - "pre_import_author": "original-author", - } - assert classify_system_note(string)[1] == expected_captures - - def test_returns_classifier_name_for_known_string(self): - string = "assigned to @developer" - expected_name = "assign_user" - assert classify_system_note(string)[0] == expected_name - - def 
test_returns_default_annotation_for_unknown_string(self): - string = "UNKNOWN" - expected_name = "default_annotation" - assert classify_system_note(string)[0] == expected_name diff --git a/tests/unit/test_classifiers.py b/tests/unit/test_classifiers.py deleted file mode 100644 index cdf7dff..0000000 --- a/tests/unit/test_classifiers.py +++ /dev/null @@ -1,87 +0,0 @@ -import random -import re -import string - -import pytest - -from gitlab2prov.adapters.fetch.annotations.classifiers import Classifier -from gitlab2prov.adapters.fetch.annotations.classifiers import ImportStatement -from gitlab2prov.adapters.fetch.annotations.classifiers import match_length - - -class TestMatchLength: - def test_raises_value(self): - with pytest.raises(TypeError): - match_length(None) - - def test_match_length_with_n_length_matches(self): - for idx in range(1, 1000): - pattern = r"\d{%d}" % idx - s = "".join(random.choices(string.digits, k=idx)) - match = re.search(pattern, s) - assert match_length(match) == idx - - -class TestClassifier: - def test_longest_matching_classifier_wins_selection(self): - classifiers = [ - Classifier(patterns=[r"\d{1}"]), - Classifier(patterns=[r"\d{2}"]), - Classifier(patterns=[r"\d{3}"]), - ] - for classifier in classifiers: - classifier.matches(string.digits) - assert max(classifiers, key=len) == classifiers[-1] - - def test_matches_should_return_true_if_any_pattern_matches(self): - classifier = Classifier(patterns=[r"\d", r"\s"]) - assert classifier.matches(string.digits) == True - - def test_matches_should_return_false_if_no_pattern_matches(self): - c = Classifier(patterns=[r"\d", r"\s"]) - assert c.matches(string.ascii_letters) == False - - def test_matches_should_store_the_longest_match_in_the_class_attributes(self): - regexes = [r"\d{1}", r"\d{2}", r"\d{3}"] - classifier = Classifier(patterns=regexes) - classifier.matches(string.digits) - assert classifier.match.re.pattern == regexes[-1] - - def 
test_groupdict_should_return_empty_dict_if_no_pattern_matches(self): - classifier = Classifier(patterns=[r"\d"]) - classifier.matches(string.ascii_letters) - assert classifier.groupdict() == dict() - - def test_groupdict_should_return_captured_groups_if_a_pattern_matches(self): - classifier = Classifier(patterns=[r"(?P<number>\d)"]) - classifier.matches(string.digits) - assert classifier.groupdict() == {"number": string.digits[0]} - - def test_length_should_be_0_if_no_match_was_found(self): - classifier = Classifier(patterns=[r"\d"]) - classifier.matches(string.ascii_letters) - assert len(classifier) == 0 - - def test_length_should_be_the_span_of_the_found_match(self): - classifier = Classifier(patterns=[r"\d"]) - classifier.matches(string.digits) - assert len(classifier) == 1 - - -class TestImportStatement: - def test_replace_returns_unchanged_string_if_no_match_was_found(self): - imp = ImportStatement(patterns=[r"\d{3}"]) - imp.matches(string.ascii_letters) - assert imp.replace(string.ascii_letters) == string.ascii_letters - - def test_import_statement_removes_only_the_leftmost_occurence(self): - imp = ImportStatement(patterns=[r"\d{3}"]) - imp.matches(string.digits) - assert imp.replace(string.digits) == string.digits[3:] - - def test_removes_trailing_whitespace_after_import_pattern_replacement(self): - imp = ImportStatement(patterns=[r"\d{3}"]) - s = f"{string.whitespace}{string.digits}{string.whitespace}" - imp.matches(s) - assert not imp.replace(s).endswith(" ") - assert not imp.replace(s).startswith(" ") diff --git a/tests/unit/test_fetch_utils.py b/tests/unit/test_fetch_utils.py deleted file mode 100644 index 8e0ee58..0000000 --- a/tests/unit/test_fetch_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -from gitlab2prov.adapters.fetch import utils - - -class TestHelpers: - def test_project_slug(self): - expected_slug = "owner/project" - assert expected_slug == utils.project_slug("https://gitlab.com/owner/project") - - def test_gitlab_url(self): - expected_url = 
"https://gitlab.com" - assert expected_url == utils.instance_url("https://gitlab.com/owner/project") - - def test_github_url(self): - expected_url = "https://github.com" - assert expected_url == utils.instance_url("https://github.com/owner/project") - - def test_clone_over_https_url(self): - expected_gitlab_url = "https://gitlab.com:TOKEN@gitlab.com/owner/project" - assert expected_gitlab_url == utils.clone_over_https_url( - "https://gitlab.com/owner/project", "TOKEN", "gitlab" - ) - expected_github_url = "https://TOKEN@github.com/owner/project.git" - assert expected_github_url == utils.clone_over_https_url( - "https://github.com/owner/project", "TOKEN", "github" - ) diff --git a/tests/unit/test_handlers.py b/tests/unit/test_handlers.py deleted file mode 100644 index 14224f5..0000000 --- a/tests/unit/test_handlers.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import TypeVar, Type, Optional - -from gitlab2prov import bootstrap -from gitlab2prov.adapters import repository -from gitlab2prov.service_layer import unit_of_work - - -R = TypeVar("R") - - -class FakeRepository(repository.AbstractRepository): - def __init__(self, resources: R): - self._resources = set(resources) - - def _add(self, resource: R): - self._resources.add(resource) - - def _get(self, resource_type: Type[R], **filters) -> Optional[R]: - return next( - ( - r - for r in self._resources - if all(getattr(r, key) == val for key, val in filters.items()) - ) - ) - - def _list_all(self, resource_type: Type[R], **filters) -> list[R]: - return [ - r - for r in self._resources - if all(getattr(r, key) == val for key, val in filters.items()) - ] - - -class FakeUnitOfWork(unit_of_work.AbstractUnitOfWork): - def __init__(self): - self.resources = FakeRepository([]) - self.committed = False - - def _commit(self): - self.committed = True - - def rollback(self): - pass - - -def FakeGitFetcher(resources): - class FakeGitRepositoryMiner: - def __init__(self, url, token): - self.resources = resources - - def 
__enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - def do_clone(self): - pass - - def fetch_git(self): - return iter(self.resources) - - return FakeGitRepositoryMiner - - -def FakeGitlabFetcher(resources): - class FakeGitlabFetcher: - def __init__(self, url, token): - self.resources = resources - - def do_login(self): - pass - - def fetch_gitlab(self): - return iter(self.resources) - - return FakeGitlabFetcher - - -def bootstrap_test_app(git_resources=None, gitlab_resources=None): - if git_resources is None: - git_resources = [] - if gitlab_resources is None: - gitlab_resources = [] - return bootstrap.bootstrap( - uow=FakeUnitOfWork(), - git_fetcher=FakeGitFetcher(git_resources), - gitlab_fetcher=FakeGitlabFetcher(gitlab_resources), - ) - - -class TestHandlers: - pass diff --git a/tests/unit/test_objects.py b/tests/unit/test_objects.py deleted file mode 100644 index 204dcbf..0000000 --- a/tests/unit/test_objects.py +++ /dev/null @@ -1,869 +0,0 @@ -from datetime import datetime, timedelta -from urllib.parse import urlencode - -from prov.model import ( - PROV_TYPE, - PROV_ROLE, - PROV_ATTR_STARTTIME, - PROV_ATTR_ENDTIME, - PROV_LABEL, -) - -from gitlab2prov.domain import objects -from gitlab2prov.domain.constants import ProvType, ProvRole -from gitlab2prov.prov.operations import qualified_name - -from tests.random_refs import random_suffix - - -today = datetime.now() -yesterday = today - timedelta(days=1) -next_week = today + timedelta(days=7) -tomorrow = today + timedelta(days=1) - - -class TestUser: - def test_identifier(self): - name = f"user-name-{random_suffix()}" - email = f"user-email-{random_suffix()}" - username = f"user-username-{random_suffix()}" - id = f"user-id-{random_suffix()}" - role = ProvRole.AUTHOR - user = objects.User( - name=name, - email=email, - gitlab_username=username, - gitlab_id=id, - prov_role=role, - ) - expected_identifier = qualified_name( - f"User?{urlencode([('name', name), ('email', 
email)])}" - ) - assert user.prov_identifier == expected_identifier - - def test_attributes(self): - name = f"user-name-{random_suffix()}" - email = f"user-email-{random_suffix()}" - username = f"user-username-{random_suffix()}" - id = f"user-id-{random_suffix()}" - role = f"user-prov-role-{random_suffix()}" - role = ProvRole.AUTHOR - user = objects.User( - name=name, - email=email, - gitlab_username=username, - gitlab_id=id, - prov_role=role, - ) - expected_attributes = [ - ("name", name), - ("email", email), - ("gitlab_username", username), - ("gitlab_id", id), - (PROV_ROLE, role), - (PROV_TYPE, ProvType.USER), - (PROV_LABEL, user.prov_label), - ] - assert user.prov_attributes == expected_attributes - - def test_email_normalization(self): - name = f"user-name-{random_suffix()}" - role = f"user-prov-role-{random_suffix()}" - uppercase = f"user-email-{random_suffix()}".upper() - user = objects.User(name=name, email=uppercase, prov_role=role) - assert user.email.islower() - - -class TestFile: - def test_identifier(self): - path = f"file-path-{random_suffix()}" - hexsha = f"commit-hash-{random_suffix()}" - f = objects.File(path=path, committed_in=hexsha) - expected_identifier = qualified_name( - f"File?{urlencode([('path', path), ('committed_in', hexsha)])}" - ) - assert f.prov_identifier == expected_identifier - - def test_attributes(self): - path = f"file-path-{random_suffix()}" - hexsha = f"commit-hash-{random_suffix()}" - f = objects.File(path=path, committed_in=hexsha) - expected_attributes = [ - ("path", path), - ("committed_in", hexsha), - (PROV_TYPE, ProvType.FILE), - (PROV_LABEL, f.prov_label), - ] - assert f.prov_attributes == expected_attributes - - -class TestFileRevision: - def test_identifier(self): - path = f"file-path-{random_suffix()}" - hexsha = f"commit-hash-{random_suffix()}" - change_type = f"change-type-{random_suffix()}" - file_revision = objects.FileRevision( - path=path, - committed_in=hexsha, - change_type=change_type, - original=None, - 
previous=None, - ) - expected_identifier = qualified_name( - f"FileRevision?{urlencode([('path', path), ('committed_in', hexsha), ('change_type', change_type)])}" - ) - assert file_revision.prov_identifier == expected_identifier - - def test_attributes(self): - path = f"file-path-{random_suffix()}" - hexsha = f"commit-hash-{random_suffix()}" - change_type = f"change-type-{random_suffix()}" - file_revision = objects.FileRevision( - path=path, - committed_in=hexsha, - change_type=change_type, - original=None, - previous=None, - ) - expected_attributes = [ - ("path", path), - ("committed_in", hexsha), - ("change_type", change_type), - (PROV_TYPE, ProvType.FILE_REVISION), - (PROV_LABEL, file_revision.prov_label), - ] - assert file_revision.prov_attributes == expected_attributes - - -class TestGitCommit: - def test_identifier(self): - hexsha = f"commit-hash-{random_suffix()}" - msg = f"commit-message-{random_suffix()}" - title = f"commit-title-{random_suffix()}" - commit = objects.GitCommit( - hexsha=hexsha, - message=msg, - title=title, - author=None, - committer=None, - parents=[], - prov_start=today, - prov_end=tomorrow, - ) - expected_identifier = qualified_name(f"GitCommit?{urlencode([('hexsha', hexsha)])}") - assert commit.prov_identifier == expected_identifier - - def test_attributes(self): - hexsha = f"commit-hash-{random_suffix()}" - msg = f"commit-message-{random_suffix()}" - title = f"commit-title-{random_suffix()}" - commit = objects.GitCommit( - hexsha=hexsha, - message=msg, - title=title, - author=None, - committer=None, - parents=[], - prov_start=today, - prov_end=tomorrow, - ) - expected_attributes = [ - ("hexsha", hexsha), - ("message", msg), - ("title", title), - (PROV_ATTR_STARTTIME, today), - (PROV_ATTR_ENDTIME, tomorrow), - (PROV_TYPE, ProvType.GIT_COMMIT), - (PROV_LABEL, commit.prov_label), - ] - assert commit.prov_attributes == expected_attributes - - -class TestAsset: - def test_identifier(self): - url = f"asset-url-{random_suffix()}" - fmt = 
f"asset-format-{random_suffix()}" - asset = objects.Asset(url=url, format=fmt) - expected_identifier = qualified_name(f"Asset?{urlencode([('url', url), ('format', fmt)])}") - assert asset.prov_identifier == expected_identifier - - def test_attributes(self): - url = f"asset-url-{random_suffix()}" - fmt = f"asset-format-{random_suffix()}" - asset = objects.Asset(url=url, format=fmt) - expected_attributes = [ - ("url", url), - ("format", fmt), - (PROV_TYPE, ProvType.ASSET), - (PROV_LABEL, asset.prov_label), - ] - assert asset.prov_attributes == expected_attributes - - -class TestEvidence: - def test_identifier(self): - sha = f"evidence-sha-{random_suffix()}" - url = f"evidence-url-{random_suffix()}" - evidence = objects.Evidence(hexsha=sha, url=url, collected_at=today) - expected_identifier = qualified_name( - f"Evidence?{urlencode([('hexsha', sha), ('url', url), ('collected_at', today)])}" - ) - assert evidence.prov_identifier == expected_identifier - - def test_attributes(self): - sha = f"evidence-sha-{random_suffix()}" - url = f"evidence-url-{random_suffix()}" - evidence = objects.Evidence(hexsha=sha, url=url, collected_at=today) - expected_attributes = [ - ("hexsha", sha), - ("url", url), - ("collected_at", today), - (PROV_TYPE, ProvType.EVIDENCE), - (PROV_LABEL, evidence.prov_label), - ] - assert evidence.prov_attributes == expected_attributes - - -class TestAnnotatedVersion: - def test_identifier(self): - vid = f"version-id-{random_suffix()}" - aid = f"annotation-id-{random_suffix()}" - annotated_version = objects.AnnotatedVersion( - version_id=vid, - annotation_id=aid, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - expected_identifier = qualified_name( - f"{ProvType.GITLAB_COMMIT_VERSION_ANNOTATED}?{urlencode([('version_id', vid), ('annotation_id', aid)])}" - ) - assert annotated_version.prov_identifier == expected_identifier - - def test_attributes(self): - vid = f"version-id-{random_suffix()}" - aid = f"annotation-id-{random_suffix()}" - 
annotated_version = objects.AnnotatedVersion( - version_id=vid, annotation_id=aid, prov_type="TestAnnotatedVersion" - ) - expected_attributes = [ - ("version_id", vid), - ("annotation_id", aid), - (PROV_TYPE, "TestAnnotatedVersion"), - (PROV_LABEL, annotated_version.prov_label), - ] - assert annotated_version.prov_attributes == expected_attributes - - -class TestCreation: - def test_identifier(self): - id = f"creation-id-{random_suffix()}" - creation = objects.Creation( - creation_id=id, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.TAG_CREATION, - ) - expected_identifier = qualified_name( - f"{ProvType.TAG_CREATION}?{urlencode([('creation_id', id)])}" - ) - assert creation.prov_identifier == expected_identifier - - def test_attributes(self): - id = f"creation-id-{random_suffix()}" - creation = objects.Creation( - creation_id=id, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.TAG_CREATION, - ) - expected_attributes = [ - ("creation_id", id), - (PROV_ATTR_STARTTIME, today), - (PROV_ATTR_ENDTIME, tomorrow), - (PROV_TYPE, "TagCreation"), - (PROV_LABEL, creation.prov_label), - ] - assert creation.prov_attributes == expected_attributes - - -class TestAnnotation: - def test_identifier(self): - id = f"annotation-id-{random_suffix()}" - type = f"annotation-type-{random_suffix()}" - body = f"annotation-body-{random_suffix()}" - annotation = objects.Annotation( - id=id, - type=type, - body=body, - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - expected_identifier = qualified_name( - f"Annotation?{urlencode([('id', id), ('type', type)])}" - ) - assert annotation.prov_identifier == expected_identifier - - def test_attributes(self): - id = f"annotation-id-{random_suffix()}" - type = f"annotation-type-{random_suffix()}" - body = f"annotation-body-{random_suffix()}" - annotation = objects.Annotation( - id=id, - type=type, - body=body, - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - expected_attributes = [ - ("id", 
id), - ("type", type), - ("body", body), - (PROV_ATTR_STARTTIME, today), - (PROV_ATTR_ENDTIME, tomorrow), - (PROV_TYPE, ProvType.ANNOTATION), - (PROV_LABEL, annotation.prov_label), - ] - assert annotation.prov_attributes == expected_attributes - - def test_kwargs(self): - id = f"annotation-id-{random_suffix()}" - type = f"annotation-type-{random_suffix()}" - body = f"annotation-body-{random_suffix()}" - kwargs = {"kwarg1": "value1", "kwarg2": "value2"} - annotation = objects.Annotation( - id=id, - type=type, - body=body, - annotator=None, - prov_start=today, - prov_end=tomorrow, - kwargs=kwargs, - ) - expected_attributes = [ - ("id", id), - ("type", type), - ("body", body), - ("kwarg1", "value1"), - ("kwarg2", "value2"), - (PROV_ATTR_STARTTIME, today), - (PROV_ATTR_ENDTIME, tomorrow), - (PROV_TYPE, ProvType.ANNOTATION), - (PROV_LABEL, annotation.prov_label), - ] - assert annotation.prov_attributes == expected_attributes - - -class TestIssue: - def test_identifier(self): - id = f"issue-id-{random_suffix()}" - iid = f"issue-iid-{random_suffix()}" - title = f"issue-title-{random_suffix()}" - desc = f"issue-description-{random_suffix()}" - url = f"issue-url-{random_suffix()}" - issue = objects.Issue( - id=id, - iid=iid, - title=title, - description=desc, - url=url, - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - ) - expected_identifier = qualified_name( - f"Issue?{urlencode([('id', id), ('iid', iid), ('title', title)])}" - ) - assert issue.prov_identifier == expected_identifier - - def test_attributes(self): - id = f"issue-id-{random_suffix()}" - iid = f"issue-iid-{random_suffix()}" - title = f"issue-title-{random_suffix()}" - desc = f"issue-description-{random_suffix()}" - url = f"issue-url-{random_suffix()}" - issue = objects.Issue( - id=id, - iid=iid, - title=title, - description=desc, - url=url, - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - ) - expected_attributes = [ - ("id", id), - ("iid", iid), - 
("title", title), - ("description", desc), - ("url", url), - ("created_at", today), - ("closed_at", tomorrow), - (PROV_TYPE, ProvType.ISSUE), - (PROV_LABEL, issue.prov_label), - ] - assert issue.prov_attributes == expected_attributes - - def test_creation(self): - id = f"issue-id-{random_suffix()}" - issue = objects.Issue( - id=id, - iid="", - title="", - description="", - url="", - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - ) - expected_creation = objects.Creation( - creation_id=id, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.ISSUE_CREATION, - ) - assert issue.creation == expected_creation - - def test_first_version(self): - id = f"issue-id-{random_suffix()}" - issue = objects.Issue( - id=id, - iid="", - title="", - description="", - url="", - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - ) - expected_first_version = objects.Version(version_id=id, prov_type=ProvType.ISSUE_VERSION) - assert issue.first_version == expected_first_version - - def test_annotated_versions(self): - hexsha = f"commit-sha-{random_suffix()}" - aid1 = f"annotation-id-{random_suffix()}" - aid2 = f"annotation-id-{random_suffix()}" - annot1 = objects.Annotation( - id=aid1, - type="", - body="", - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annot2 = objects.Annotation( - id=aid2, - type="", - body="", - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annots = [annot1, annot2] - commit = objects.GitlabCommit( - hexsha=hexsha, - url="", - author=None, - annotations=annots, - authored_at=today, - committed_at=tomorrow, - ) - ver1 = objects.AnnotatedVersion( - version_id=hexsha, - annotation_id=annot1.id, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - ver2 = objects.AnnotatedVersion( - version_id=hexsha, - annotation_id=annot2.id, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - expected_versions = [ver1, ver2] - assert commit.annotated_versions == 
expected_versions - - -class TestGitlabCommit: - def test_identifier(self): - hexsha = f"commit-hash-{random_suffix()}" - url = f"commit-url-{random_suffix()}" - commit = objects.GitlabCommit( - hexsha=hexsha, - url=url, - author=None, - annotations=[], - authored_at=today, - committed_at=tomorrow, - ) - expected_identifier = qualified_name(f"GitlabCommit?{urlencode([('hexsha', hexsha)])}") - assert commit.prov_identifier == expected_identifier - - def test_attributes(self): - hexsha = f"commit-hash-{random_suffix()}" - url = f"commit-url-{random_suffix()}" - commit = objects.GitlabCommit( - hexsha=hexsha, - url=url, - author=None, - annotations=[], - authored_at=today, - committed_at=tomorrow, - ) - expected_attributes = [ - ("hexsha", hexsha), - ("url", url), - ("authored_at", today), - ("committed_at", tomorrow), - (PROV_TYPE, ProvType.GITLAB_COMMIT), - (PROV_LABEL, commit.prov_label), - ] - assert commit.prov_attributes == expected_attributes - - def test_creation(self): - hexsha = f"commit-sha-{random_suffix()}" - commit = objects.GitlabCommit( - hexsha=hexsha, - url="", - author=None, - annotations=[], - authored_at=today, - committed_at=tomorrow, - ) - expected_creation = objects.Creation( - creation_id=hexsha, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.GITLAB_COMMIT_CREATION, - ) - assert commit.creation == expected_creation - - def test_first_version(self): - hexsha = f"commit-sha-{random_suffix()}" - commit = objects.GitlabCommit( - hexsha=hexsha, - url="", - author=None, - annotations=[], - authored_at=today, - committed_at=tomorrow, - ) - expected_first_version = objects.Version( - version_id=hexsha, prov_type=ProvType.GITLAB_COMMIT_VERSION - ) - assert commit.first_version == expected_first_version - - def test_annotated_versions(self): - hexsha = f"commit-sha-{random_suffix()}" - aid1 = f"annotation-id-{random_suffix()}" - aid2 = f"annotation-id-{random_suffix()}" - annot1 = objects.Annotation( - id=aid1, - type="", - body="", - 
annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annot2 = objects.Annotation( - id=aid2, - type="", - body="", - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annots = [annot1, annot2] - commit = objects.GitlabCommit( - hexsha=hexsha, - url="", - author=None, - annotations=annots, - authored_at=today, - committed_at=tomorrow, - ) - ver1 = objects.AnnotatedVersion( - version_id=hexsha, - annotation_id=annot1.id, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - ver2 = objects.AnnotatedVersion( - version_id=hexsha, - annotation_id=annot2.id, - prov_type=ProvType.GITLAB_COMMIT_VERSION_ANNOTATED, - ) - expected_versions = [ver1, ver2] - assert commit.annotated_versions == expected_versions - - -class TestMergeRequest: - def test_identifier(self): - id = f"merge-request-id-{random_suffix()}" - iid = f"merge-request-iid-{random_suffix()}" - title = f"merge-request-title-{random_suffix()}" - desc = f"merge-request-description-{random_suffix()}" - url = f"merge-request-url-{random_suffix()}" - source_branch = f"merge-request-source-branch-{random_suffix()}" - target_branch = f"merge-request-target-branch-{random_suffix()}" - merge_request = objects.MergeRequest( - id=id, - iid=iid, - title=title, - description=desc, - url=url, - source_branch=source_branch, - target_branch=target_branch, - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - merged_at=next_week, - first_deployed_to_production_at=yesterday, - ) - expected_identifier = qualified_name( - f"MergeRequest?{urlencode([('id', id), ('iid', iid), ('title', title)])}" - ) - assert merge_request.prov_identifier == expected_identifier - - def test_attributes(self): - id = f"merge-request-id-{random_suffix()}" - iid = f"merge-request-iid-{random_suffix()}" - title = f"merge-request-title-{random_suffix()}" - desc = f"merge-request-description-{random_suffix()}" - url = f"merge-request-url-{random_suffix()}" - source_branch = 
f"merge-request-source-branch-{random_suffix()}" - target_branch = f"merge-request-target-branch-{random_suffix()}" - merge_request = objects.MergeRequest( - id=id, - iid=iid, - title=title, - description=desc, - url=url, - source_branch=source_branch, - target_branch=target_branch, - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - merged_at=next_week, - first_deployed_to_production_at=yesterday, - ) - expected_attributes = [ - ("id", id), - ("iid", iid), - ("title", title), - ("description", desc), - ("url", url), - ("source_branch", source_branch), - ("target_branch", target_branch), - ("created_at", today), - ("closed_at", tomorrow), - ("merged_at", next_week), - ("first_deployed_to_production_at", yesterday), - (PROV_TYPE, ProvType.MERGE_REQUEST), - (PROV_LABEL, merge_request.prov_label), - ] - assert merge_request.prov_attributes == expected_attributes - - def test_creation(self): - id = f"merge-request-id-{random_suffix()}" - merge_request = objects.MergeRequest( - id=id, - iid="", - title="", - description="", - url="", - source_branch="", - target_branch="", - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - merged_at=yesterday, - first_deployed_to_production_at=next_week, - ) - expected_creation = objects.Creation( - creation_id=id, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.MERGE_REQUEST_CREATION, - ) - assert merge_request.creation == expected_creation - - def test_first_version(self): - id = f"merge-request-id-{random_suffix()}" - merge_request = objects.MergeRequest( - id=id, - iid="", - title="", - description="", - url="", - source_branch="", - target_branch="", - author=None, - annotations=[], - created_at=today, - closed_at=tomorrow, - merged_at=yesterday, - first_deployed_to_production_at=next_week, - ) - expected_version = objects.Version(version_id=id, prov_type=ProvType.MERGE_REQUEST_VERSION) - assert merge_request.first_version == expected_version - - def 
test_annotated_versions(self): - id = f"merge-request-id-{random_suffix()}" - aid1 = f"annotation-id-{random_suffix()}" - aid2 = f"annotation-id-{random_suffix()}" - annot1 = objects.Annotation( - id=aid1, - type="", - body="", - kwargs=None, - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annot2 = objects.Annotation( - id=aid2, - type="", - body="", - kwargs=None, - annotator=None, - prov_start=today, - prov_end=tomorrow, - ) - annots = [annot1, annot2] - merge_request = objects.MergeRequest( - id=id, - iid="", - title="", - description="", - url="", - source_branch="", - target_branch="", - author=None, - annotations=annots, - created_at=today, - closed_at=tomorrow, - merged_at=yesterday, - first_deployed_to_production_at=next_week, - ) - ver1 = objects.AnnotatedVersion( - version_id=id, - annotation_id=annot1.id, - prov_type=ProvType.MERGE_REQUEST_VERSION_ANNOTATED, - ) - ver2 = objects.AnnotatedVersion( - version_id=id, - annotation_id=annot2.id, - prov_type=ProvType.MERGE_REQUEST_VERSION_ANNOTATED, - ) - expected_versions = [ver1, ver2] - assert merge_request.annotated_versions == expected_versions - - -class TestTag: - def test_identifier(self): - name = f"tag-name-{random_suffix()}" - hexsha = f"commit-sha-{random_suffix()}" - msg = f"tag-message-{random_suffix()}" - tag = objects.Tag(name=name, hexsha=hexsha, message=msg, author=None, created_at=today) - expected_identifier = qualified_name( - f"Tag?{urlencode([('name', name), ('hexsha', hexsha)])}" - ) - assert tag.prov_identifier == expected_identifier - - def test_attributes(self): - name = f"tag-name-{random_suffix()}" - hexsha = f"commit-sha-{random_suffix()}" - msg = f"tag-message-{random_suffix()}" - tag = objects.Tag(name=name, hexsha=hexsha, message=msg, author=None, created_at=today) - expected_attributes = [ - ("name", name), - ("hexsha", hexsha), - ("message", msg), - ("created_at", today), - (PROV_TYPE, ProvType.TAG), - (PROV_TYPE, ProvType.COLLECTION), - (PROV_LABEL, 
tag.prov_label), - ] - assert tag.prov_attributes == expected_attributes - - def test_creation(self): - name = f"tag-name-{random_suffix()}" - tag = objects.Tag(name=name, hexsha="", message="", author=None, created_at=today) - expected_creation = objects.Creation( - creation_id=name, - prov_start=today, - prov_end=today, - prov_type=ProvType.TAG_CREATION, - ) - assert tag.creation == expected_creation - - -class TestRelease: - def test_identifier(self): - name = f"release-name-{random_suffix()}" - desc = f"release-description-{random_suffix()}" - tag_name = f"tag-name-{random_suffix()}" - release = objects.Release( - name=name, - description=desc, - tag_name=tag_name, - author=None, - assets=[], - evidences=[], - created_at=today, - released_at=tomorrow, - ) - expected_identifier = qualified_name(f"Release?{urlencode([('name', name)])}") - assert release.prov_identifier == expected_identifier - - def test_attributes(self): - name = f"release-name-{random_suffix()}" - desc = f"release-description-{random_suffix()}" - tag_name = f"tag-name-{random_suffix()}" - release = objects.Release( - name=name, - description=desc, - tag_name=tag_name, - author=None, - assets=[], - evidences=[], - created_at=today, - released_at=tomorrow, - ) - expected_attributes = [ - ("name", name), - ("description", desc), - ("tag_name", tag_name), - ("created_at", today), - ("released_at", tomorrow), - (PROV_TYPE, ProvType.RELEASE), - (PROV_TYPE, ProvType.COLLECTION), - (PROV_LABEL, release.prov_label), - ] - assert release.prov_attributes == expected_attributes - - def test_creation(self): - name = f"release-name-{random_suffix()}" - release = objects.Release( - name=name, - description="", - tag_name="", - author=None, - assets=[], - evidences=[], - created_at=today, - released_at=tomorrow, - ) - expected_creation = objects.Creation( - creation_id=name, - prov_start=today, - prov_end=tomorrow, - prov_type=ProvType.RELEASE_CREATION, - ) - assert release.creation == expected_creation diff 
--git a/tests/unit/test_operations.py b/tests/unit/test_operations.py deleted file mode 100644 index d8bb6c2..0000000 --- a/tests/unit/test_operations.py +++ /dev/null @@ -1,225 +0,0 @@ -import hashlib - -from prov.model import ProvAgent, ProvDocument, ProvRelation, PROV_ROLE, PROV_TYPE - -from gitlab2prov.prov import operations -from gitlab2prov.prov.operations import qualified_name - -from tests.random_refs import random_suffix - - -class TestStats: - def test_format_as_ascii_table(self): - d = {"A": 1, "B": 2, "C": 3} - expected_header = [ - f"|{'Record Type':20}|{'Count':20}|", - f"+{'-'*20}+{'-'*20}+", - ] - expected_body = [ - f"|{'A':20}|{1:20}|", - f"|{'B':20}|{2:20}|", - f"|{'C':20}|{3:20}|", - ] - table = operations.format_stats_as_ascii_table(d) - lines = [l.strip() for l in table.split("\n") if l] - assert lines[:2] == expected_header - assert lines[2:] == expected_body - - def test_format_stats_as_csv(self): - d = {"A": 1, "B": 2, "C": 3} - expected_header = ["Record Type, Count"] - expected_body = [ - "A, 1", - "B, 2", - "C, 3", - ] - csv = operations.format_stats_as_csv(d) - lines = [l.strip() for l in csv.split("\n") if l] - assert lines[:1] == expected_header - assert lines[1:] == expected_body - - -class TestGraphFactory: - def test_namespace_uri_is_gitlab2prov(self): - graph = operations.graph_factory() - expected_uri = "http://github.com/dlr-sc/gitlab2prov/" - assert graph.get_default_namespace().uri == expected_uri - - def test_init_wo_list_of_records(self): - uri = "http://github.com/dlr-sc/gitlab2prov/" - expected_graph = ProvDocument() - expected_graph.set_default_namespace(uri) - assert operations.graph_factory() == expected_graph - - def test_init_with_list_of_records(self): - records = [ - ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}")), - ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}")), - ] - expected_graph = ProvDocument(records) - assert operations.graph_factory(records) == expected_graph - - -class 
TestCombine: - def test_returns_empty_graph_when_run_wo_subgraphs(self): - assert operations.combine(iter([])) == operations.graph_factory() - - def test_carries_over_all_records(self): - agent1 = ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}")) - agent2 = ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}")) - graph1 = ProvDocument([agent1]) - graph2 = ProvDocument([agent2]) - subgraphs = [graph1, graph2] - expected_graph = ProvDocument([agent1, agent2]) - assert operations.combine(iter(subgraphs)) == expected_graph - - -class TestDedupe: - def test_removes_duplicate_elements(self): - agent = ProvAgent(None, qualified_name(f"agent-id-{random_suffix()}")) - graph = ProvDocument([agent, agent]) - expected_graph = ProvDocument([agent]) - assert list(graph.get_records(ProvAgent)) == [agent, agent] - assert list(operations.dedupe(graph).get_records(ProvAgent)) == [agent] - assert operations.dedupe(graph) == expected_graph - - def test_merges_attributes_of_duplicate_elements(self): - id = qualified_name(f"agent-id-{random_suffix()}") - graph = ProvDocument() - graph.agent(id, {"attribute1": 1}) - graph.agent(id, {"attribute2": 2}) - expected_attributes = [ - (qualified_name("attribute1"), 1), - (qualified_name("attribute2"), 2), - ] - agents = list(operations.dedupe(graph).get_records(ProvAgent)) - assert len(agents) == 1 - assert agents[0].attributes == expected_attributes - - def test_remove_duplicate_relations(self): - graph = ProvDocument() - agent = graph.agent(qualified_name(f"agent-id-{random_suffix()}")) - entity = graph.entity(qualified_name(f"entity-id-{random_suffix()}")) - r1 = graph.wasAttributedTo(entity, agent) - r2 = graph.wasAttributedTo(entity, agent) - assert list(graph.get_records(ProvRelation)) == [r1, r2] - assert list(operations.dedupe(graph).get_records(ProvRelation)) == [r1] - - def test_merges_attributes_of_duplicate_relations(self): - graph = ProvDocument() - agent = 
graph.agent(qualified_name(f"agent-id-{random_suffix()}")) - entity = graph.entity(qualified_name(f"entity-id-{random_suffix()}")) - r1_attrs = [(qualified_name("attr"), "val1")] - r2_attrs = [(qualified_name("attr"), "val2")] - graph.wasAttributedTo(entity, agent, other_attributes=r1_attrs) - graph.wasAttributedTo(entity, agent, other_attributes=r2_attrs) - - graph = operations.dedupe(graph) - - relations = list(graph.get_records(ProvRelation)) - assert len(relations) == 1 - expected_extra_attributes = set( - [ - (qualified_name("attr"), "val1"), - (qualified_name("attr"), "val2"), - ] - ) - assert set(relations[0].extra_attributes) == expected_extra_attributes - - -class TestUncoverDoubleAgents: - def test_build_inverse_index(self): - mapping = {"name": ["alias1", "alias2"]} - expected_dict = {"alias1": "name", "alias2": "name"} - assert operations.build_inverse_index(mapping) == expected_dict - - def test_uncover_name(self): - names = {"alias": "name"} - graph = operations.graph_factory() - agent = graph.agent("agent-id", other_attributes={qualified_name("name"): "alias"}) - expected_name = (qualified_name("name"), "name") - assert operations.uncover_name(agent, names) == expected_name - - def test_uncover_duplicated_agents_resolves_agent_alias(self, mocker): - d = {"alias1": "name", "alias2": "name"} - mocker.patch("gitlab2prov.prov.operations.read_duplicated_agent_mapping") - mocker.patch("gitlab2prov.prov.operations.build_inverse_index", return_value=d) - - graph = operations.graph_factory() - graph.agent("agent1", {"name": "alias2"}) - graph.agent("agent2", {"name": "alias1"}) - - graph = operations.merge_duplicated_agents(graph, "") - - agents = list(graph.get_records(ProvAgent)) - assert len(agents) == 1 - expected_name = "name" - [(_, name)] = [(k, v) for k, v in agents[0].attributes if k.localpart == "name"] - assert name == expected_name - - def test_uncover_duplicated_agents_reroutes_relations(self, mocker): - d = {"alias1": "name", "alias2": "name"} - 
mocker.patch("gitlab2prov.prov.operations.read_duplicated_agent_mapping") - mocker.patch("gitlab2prov.prov.operations.build_inverse_index", return_value=d) - - graph = operations.graph_factory() - a1 = graph.agent("agent1", {"name": "alias2"}) - a2 = graph.agent("agent2", {"name": "alias1"}) - e1 = graph.entity("entity1") - e2 = graph.entity("entity2") - e1.wasAttributedTo(a1) - e2.wasAttributedTo(a2) - - graph = operations.merge_duplicated_agents(graph, "") - - relations = list(graph.get_records(ProvRelation)) - assert len(relations) == 2 - expected_identifier = "User?name=name" - assert all( - relation.formal_attributes[1][1].localpart == expected_identifier - for relation in relations - ) - - -class TestPseudonymize: - def test_pseudonymize_changes_agent_name_and_identifier(self): - graph = operations.graph_factory() - name = f"agent-name-{random_suffix()}" - email = f"agent-email-{random_suffix()}" - graph.agent("agent1", {"name": name, "email": email}) - - graph = operations.pseudonymize(graph) - - expected_name = hashlib.sha256(bytes(name, "utf-8")).hexdigest() - expected_email = hashlib.sha256(bytes(email, "utf-8")).hexdigest() - expected_identifier = qualified_name(f"User?name={expected_name}&email={expected_email}") - - agent = next(graph.get_records(ProvAgent)) - assert agent.identifier == expected_identifier - assert list(agent.get_attribute("name"))[0] == expected_name - assert list(agent.get_attribute("email"))[0] == expected_email - - def test_pseudonymize_deletes_non_name_attributes_apart_from_role_and_type(self): - graph = operations.graph_factory() - graph.agent( - "agent1", - { - "name": f"agent-name-{random_suffix()}", - "email": f"email-{random_suffix()}", - "gitlab_username": f"gitlab-username-{random_suffix()}", - "gitlab_id": f"gitlab-id-{random_suffix()}", - PROV_ROLE: f"prov-role-{random_suffix()}", - PROV_TYPE: f"prov-type-{random_suffix()}", - }, - ) - - graph = operations.pseudonymize(graph) - - agent = next(graph.get_records(ProvAgent)) 
- expected_attributes = [ - PROV_ROLE, - PROV_TYPE, - qualified_name("name"), - qualified_name("email"), - ] - assert all([(attr in expected_attributes) for (attr, _) in agent.extra_attributes]) From 52f420ddaa2c617b10ffa427206a402e8ae5c3d1 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 10:56:20 +0200 Subject: [PATCH 58/81] Update attribute tables --- docs/README.md | 393 ++++++++++++++++++++-------------- gitlab2prov/domain/objects.py | 94 +++++--- 2 files changed, 294 insertions(+), 193 deletions(-) diff --git a/docs/README.md b/docs/README.md index 9de0832..207af1e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -60,8 +60,8 @@ Both entities are generated by the commit activity and are attributed to the aut | email | - | `git config user.email` Set in the author's git config. | | gitlab_username | - | Gitlab user account username. | | github_username | - | Github user account username. | -| gitlab_email | - | Gitlab user account email. | -| github_email | - | Github user account email. | +| gitlab_id | - | Gitlab user id. | +| github_id | - | Github user id. | | prov:role | Author | Function of the agent in context of the commit activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. | @@ -74,8 +74,8 @@ Both entities are generated by the commit activity and are attributed to the aut | email | - | `git config user.email` Set in the author's git config. | | gitlab_username | - | Gitlab user account username. | | github_username | - | Github user account username. | -| gitlab_email | - | Gitlab user account email. | -| github_email | - | Github user account email. | +| gitlab_id | - | Gitlab user id. | +| github_id | - | Github user id. | | prov:role | Committer | Function of the agent in context of the commit activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. 
| @@ -87,6 +87,10 @@ Both entities are generated by the commit activity and are attributed to the aut | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | +| deleted | - | Number of lines deleted. | +| inserted | - | Number of lines inserted. | +| lines | - | Number of lines changed. | +| files | - | Number of files changed. | | authored_at | - | Time at which the commit was authored. | | committed_at | - | Time at which the commit was committed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | @@ -106,15 +110,18 @@ Both entities are generated by the commit activity and are attributed to the aut **`File Revision`** -| Attribute | Fixed Value | Description | -| ----------- | ---------------------------------- | ---------------------------------------------------------------------------- | -| name | - | Current file name. | -| path | - | Current file path of this revision. | -| commit | - | SHA1 of the commit that added this revision to the repository. | -| status | `added` or `modified` or `deleted` | Change status of the file revision. | -| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | -| prov:type | FileRevision | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +| name | - | Current file name. | +| path | - | Current file path of this revision. | +| commit | - | SHA1 of the commit that added this revision to the repository. | +| status | `added` or `modified` or `deleted` | Change status of the file revision. | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. 
| +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:type | FileRevision | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -130,10 +137,14 @@ The following tables define the attributes attached to these relations. **`File Revision - [wasGeneratedBy] -> Commit`** -| Attribute | Fixed Value | Description | -| --------- | ----------------------------- | ---------------------------------------------------------------------- | -| prov:role | FileRevisionAtPointOfAddition | Function of the FileRevision entity in context of the Commit activity. | -| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was generated. | +| Attribute | Fixed Value | Description | +| --------- | ----------------------------- | --------------------------------------------------------------------------------------------------- | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:role | FileRevisionAtPointOfAddition | Function of the FileRevision entity in context of the Commit activity. | +| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was generated. | **`Commit - [wasAssociatedWith] -> Author`** @@ -197,9 +208,13 @@ All revisions are marked as specializations of the File entity. **`Commit`** | Attribute | Fixed Value | Description | | -------------- | ----------------------- | ---------------------------------------------- | -| hexsha | - | Commit SHA1 | -| message | - | Commit message. | +| sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | +| message | - | Commit message. | +| deleted | - | Number of lines deleted. 
| +| inserted | - | Number of lines inserted. | +| lines | - | Number of lines changed. | +| files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the commit activity ended. | | prov:type | GitCommit | Activity type. | @@ -207,32 +222,43 @@ All revisions are marked as specializations of the File entity. **`File`** -| Attribute | Fixed Value | Description | -| ------------ | ----------- | ------------------------------------------------------------------ | -| path | - | Original file path. The path at which this file was first created. | -| committed_in | - | SHA1 of the commit that added this file to the repository. | -| prov:type | File | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ----------- | ------------------------------------------------------------------ | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| prov:type | File | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`File Revision`** -| Attribute | Fixed Value | Description | -| ------------ | ------------ | ---------------------------------------------------------------------------- | -| path | - | Current file path of this revision. | -| committed_in | - | SHA1 of the commit that added this revision to the repository. | -| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | -| prov:type | FileRevision | Entity type. | -| prov:label | - | Human readable representation of the entity. 
| +| Attribute | Fixed Value | Description | +| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| status | `added` or `modified` or `deleted` | Change status of the file revision. | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:type | FileRevision | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`Previous File Revision`** -| Attribute | Fixed Value | Description | -| ------------ | ------------ | ---------------------------------------------------------------------------- | -| path | - | Current file path of this revision. | -| committed_in | - | SHA1 of the commit that added this revision to the repository. | -| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | -| prov:type | FileRevision | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| status | `added` or `modified` or `deleted` | Change status of the file revision. | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. 
| +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:type | FileRevision | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. In simple terms: Some PROV relations have attributes attached to them. @@ -247,10 +273,14 @@ The following tables define the attributes attached to these relations. **`File Revision - [wasGeneratedBy] -> Commit`** -| Attribute | Fixed Value | Description | -| --------- | ----------------------------- | -------------------------------------------------------------- | -| prov:role | FileRevisionAfterModification | Function of the File entity in context of the Commit activity. | -| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was generated. | +| Attribute | Fixed Value | Description | +| --------- | ----------------------------- | --------------------------------------------------------------------------------------------------- | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:role | FileRevisionAfterModification | Function of the File entity in context of the Commit activity. | +| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was generated. | **`Commit - [wasAssociatedWith] -> Author`** @@ -309,9 +339,13 @@ The deleted revision is invalidated by the commit that removes it from the repos **`Commit`** | Attribute | Fixed Value | Description | | -------------- | ----------------------- | ------------------------------------------- | -| hexsha | - | Commit SHA1 | -| message | - | Commit message. | +| sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | +| message | - | Commit message. 
| +| deleted | - | Number of lines deleted. | +| inserted | - | Number of lines inserted. | +| lines | - | Number of lines changed. | +| files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the commit activity ended. | | prov:type | GitCommit | Activity type. | @@ -319,22 +353,28 @@ The deleted revision is invalidated by the commit that removes it from the repos **`File`** -| Attribute | Fixed Value | Description | -| ------------ | ----------- | ------------------------------------------------------------------ | -| path | - | Original file path. The path at which this file was first created. | -| committed_in | - | SHA1 of the commit that added this file to the repository. | -| prov:type | File | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ----------- | ------------------------------------------------------------------ | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| prov:type | File | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`File Revision`** -| Attribute | Fixed Value | Description | -| ------------ | ------------ | ---------------------------------------------------------------------------- | -| path | - | Current file path of this revision. | -| committed_in | - | SHA1 of the commit that added this revision to the repository. | -| change_type | - | [`git diff`](https://git-scm.com/docs/git-diff) change type / change status. | -| prov:type | FileRevision | Entity type. | -| prov:label | - | Human readable representation of the entity. 
| +| Attribute | Fixed Value | Description | +| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +| name | - | Original file name. | +| path | - | Original file path. The path at which this file was first created. | +| commit | - | SHA1 of the commit that added this file to the repository. | +| status | `added` or `modified` or `deleted` | Change status of the file revision. | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:type | FileRevision | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -357,6 +397,9 @@ The following tables define the attributes attached to these relations. **`File Revision - [wasInvalidatedBy] -> Commit`** | Attribute | Fixed Value | Description | | --------- | ----------------------------- | ---------------------------------------------------------------------- | +| inserted | - | Number of lines inserted. | +| deleted | - | Number of lines deleted. | +| lines | - | Number of lines changed. | | prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was invalidated. | | prov:role | FileRevisionAtPointOfDeletion | Function of the FileRevision entity in context of the Commit activity. | @@ -408,22 +451,28 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Annotator`** -| Attribute | Fixed Value | Description | -| --------------- | ----------- | -------------------------------------------------------------- | -| name | - | Annotator given name. As set in the annotators GitLab profile. | -| gitlab_username | - | GitLab username. As set in the annotators GitLab profile. 
| -| gitlab_id | - | Gitlab internal user id. | -| prov:role | Annotator | Function of the agent in context of the commit activity. | -| prov:type | User | Agent type. | -| prov:label | - | Human readable representation of the agent. | +| Attribute | Fixed Value | Description | +| --------------- | ----------- | --------------------------------------------------------- | +| name | - | Annotator given name. | +| gitlab_username | - | GitLab username. As set in the annotators GitLab profile. | +| github_username | - | GitHub username. As set in the annotators GitHub profile. | +| gitlab_id | - | Gitlab user id. | +| github_id | - | GitHub user id. | +| prov:role | Annotator | Function of the agent in context of the commit activity. | +| prov:type | User | Agent type. | +| prov:label | - | Human readable representation of the agent. | **`Git Commit`** | Attribute | Fixed Value | Description | | -------------- | ----------------------- | ---------------------------------------------- | -| hexsha | - | Commit SHA1 | -| message | - | Commit message. | +| sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | +| message | - | Commit message. | +| deleted | - | Number of lines deleted. | +| inserted | - | Number of lines inserted. | +| lines | - | Number of lines changed. | +| files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the commit activity ended. | | prov:type | GitCommit | Activity type. | @@ -433,7 +482,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Creation`** | Attribute | Fixed Value | Description | | -------------- | ----------------------- | ----------------------------------------------- | -| creation_id | - | SHA1 of the commit that triggered the creation. | +| id | - | SHA1 of the commit that triggered the creation. 
| | prov:startTime | `COMMIT_COMMITTER_DATE` | Time at which the web resource was created. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the web resource was created. | | prov:type | GitlabCommitCreation | Activity type. | @@ -444,7 +493,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a | Attribute | Fixed Value | Description | | -------------- | ----------- | ----------------------------------------------------------------------------- | | id | - | Internal GitLab ID of the datastructure from which the annotation was parsed. | -| type | - | Annotation type. Parsed from the annotation body. | +| name | - | Annotation name/class. Parsed from the annotation body. | | body | - | Annotation string. The string from which the type is parsed. | | prov:startTime | - | Time at which the annotation was created. | | prov:endTime | - | Time at which the annotation was created. | @@ -459,29 +508,30 @@ All recognized annotation types are listed in the "Annotations" section of this **`Commit`** -| Attribute | Fixed Value | Description | -| ---------- | ----------- | ----------------------------------------------------- | -| hexsha | - | Commit SHA1. | -| url | - | URL to the webpage of the gitlab commit web resource. | -| prov:type | Resource | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | -------------------- | ----------------------------------------------------- | +| sha | - | Commit SHA1. | +| url | - | URL to the webpage of the gitlab commit web resource. | +| platform | `gitlab` or `github` | Platform identifier string. | +| prov:type | Resource | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`Commit Version`** | Attribute | Fixed Value | Description | | ---------- | ------------------------- | -------------------------------------------- | -| version_id | - | Commit SHA1. 
| +| id | - | Commit SHA1. | | prov:type | ResourceAtPointOfAddition | Entity type. | | prov:label | - | Human readable representation of the entity. | **`Annotated Commit Version`** -| Attribute | Fixed Value | Description | -| ------------- | ------------------------ | -------------------------------------------- | -| version_id | - | Commit SHA1. | -| annotation_id | - | Gitlab annotation id. | -| prov:type | AnnotatedResourceVersion | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ------------------------ | -------------------------------------------- | +| id | - | Commit SHA1. | +| annotation | - | Gitlab annotation id. | +| prov:type | AnnotatedResourceVersion | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -568,9 +618,11 @@ This allows for capturing the changes in the issue web resource after each annot **`Issue Author`** | Attribute | Fixed Value | Description | | --------------- | ----------- | -------------------------------------------------------- | -| name | - | Author name. As set in the authors gitlab profile. | +| name | - | Author name. | | gitlab_username | - | GitLab username. As set in the authors gitlab profile. | -| gitlab_id | - | Gitlab internal user id. | +| github_username | - | GitHub username. As set in the authors GitHub profile. | +| gitlab_id | - | GitLab user id. | +| github_id | - | GitHub user id. | | prov:role | IssueAuthor | Function of the agent in context of the commit activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. 
| @@ -580,8 +632,10 @@ This allows for capturing the changes in the issue web resource after each annot | Attribute | Fixed Value | Description | | --------------- | ----------- | -------------------------------------------------------------- | | name | - | Annotator given name. As set in the annotators gitlab profile. | -| gitlab_username | - | GitLab username. As set in the annotators gitlab profile. | -| gitlab_id | - | Gitlab internal user id. | +| gitlab_username | - | GitLab username. As set in the annotators gitlab profile. | +| github_username | - | GitHub username. As set in the annotators GitHub profile. | +| gitlab_id | - | GitLab user id. | +| github_id | - | GitHub user id. | | prov:role | Annotator | Function of the agent in context of the commit activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. | @@ -590,7 +644,7 @@ This allows for capturing the changes in the issue web resource after each annot **`Creation`** | Attribute | Fixed Value | Description | | -------------- | ------------- | ---------------------------------------------- | -| creation_id | - | Gitlab issue id. | +| id | - | Gitlab issue id. | | prov:startTime | - | Time at which the web resource was created. | | prov:endTime | - | Time at which the web resource was created. | | prov:type | IssueCreation | Activity type. | @@ -601,7 +655,7 @@ This allows for capturing the changes in the issue web resource after each annot | Attribute | Fixed Value | Description | | -------------- | ----------- | ----------------------------------------------------------------------------- | | id | - | Internal gitlab id of the datastructure from which the annotation was parsed. | -| type | - | Annotation type. Parsed from the annotation body. | +| name | - | Annotation name/class. Parsed from the annotation body. | | body | - | Annotation string. The string from which the type is parsed. | | prov:startTime | - | Time at which the annotation was created. 
| | prov:endTime | - | Time at which the annotation was created. | @@ -616,34 +670,35 @@ All recognized annotation types are listed in the "Annotations" section of this **`Issue`** -| Attribute | Fixed Value | Description | -| ----------- | ----------- | -------------------------------------------- | -| id | - | Gitlab issue ID. | -| iid | - | Internal Gitlab issue ID. | -| title | - | Issue title. | -| description | - | Issue description. | -| url | - | URL to the gitlab issue. | -| created_at | - | Time at which the issue was created at. | -| closed_at | - | Time at which the issue was closed at. | -| prov:type | Issue | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | -------------------- | -------------------------------------------- | +| id | - | Issue ID. | +| iid | - | Internal issue ID. | +| title | - | Issue title. | +| body | - | Issue body. | +| platform | `gitlab` or `github` | Platform identifier string. | +| url | - | Issue webpage url. | +| created_at | - | Time at which the issue was created at. | +| closed_at | - | Time at which the issue was closed at. | +| prov:type | Issue | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`Issue Version`** | Attribute | Fixed Value | Description | | ---------- | ------------ | -------------------------------------------- | -| version_id | - | Gitlab id of the issue. | +| id | - | GitLab/GitHub id of the issue. | | prov:type | IssueVersion | Entity type. | | prov:label | - | Human readable representation of the entity. | **`Annotated Issue Version`** -| Attribute | Fixed Value | Description | -| ------------- | --------------------- | ----------------------------------------------------------------- | -| version_id | - | Gitlab id of the issue. | -| annotation_id | - | Gitlab id of the annotation that generated the annotated version. | -| prov:type | AnnotatedIssueVersion | Entity type. 
| -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | --------------------- | ------------------------------------------------------------------------ | +| id | - | GitLab/GitHub id of the issue. | +| annotation | - | GitLab/GitHub id of the annotation that generated the annotated version. | +| prov:type | AnnotatedIssueVersion | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -733,9 +788,11 @@ Each annotated merge request version is generated by the corresponding annotatio **`Merge Request Author`** | Attribute | Fixed Value | Description | | --------------- | ------------------ | --------------------------------------------------------------- | -| name | - | Author name. As set in the authors GitLab profile. | -| gitlab_username | - | GitLab username. As set in the authors GitLab profile. | -| gitlab_id | - | Gitlab user id. | +| name | - | Author name. | +| gitlab_username | - | GitLab username. As set in the authors gitlab profile. | +| github_username | - | GitHub username. As set in the authors GitHub profile. | +| gitlab_id | - | GitLab user id. | +| github_id | - | GitHub user id. | | prov:role | MergeRequestAuthor | Function of the agent in context of the merge request activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. | @@ -744,9 +801,11 @@ Each annotated merge request version is generated by the corresponding annotatio **`Annotator`** | Attribute | Fixed Value | Description | | --------------- | ----------- | --------------------------------------------------------------- | -| name | - | Annotator given name. As set in the annotators GitLab profile. | -| gitlab_username | - | GitLab username. As set in the annotators GitLab profile. | -| gitlab_id | - | Gitlab user id. | +| name | - | Annotator given name. 
| +| gitlab_username | - | GitLab username. As set in the annotators gitlab profile. | +| github_username | - | GitHub username. As set in the annotators GitHub profile. | +| gitlab_id | - | GitLab user id. | +| github_id | - | GitHub user id. | | prov:role | Annotator | Function of the agent in context of the merge request activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. | @@ -755,7 +814,7 @@ Each annotated merge request version is generated by the corresponding annotatio **`Creation`** | Attribute | Fixed Value | Description | | -------------- | -------------------- | ---------------------------------------------- | -| creation_id | - | Gitlab merge request id. | +| id | - | GitLab/GitHub merge request id. | | prov:startTime | - | Time at which the web resource was created. | | prov:endTime | - | Time at which the web resource was created. | | prov:type | MergeRequestCreation | Activity type. | @@ -763,15 +822,15 @@ Each annotated merge request version is generated by the corresponding annotatio **`Annotation`** -| Attribute | Fixed Value | Description | -| -------------- | ----------- | ----------------------------------------------------------------------------- | -| id | - | Internal gitLab id of the datastructure from which the annotation was parsed. | -| type | - | Annotation type. Parsed from the annotation body. | -| body | - | Annotation string. The string from which the type is parsed. | -| prov:startTime | - | Time at which the annotation was created. | -| prov:endTime | - | Time at which the annotation was created. | -| prov:type | Annotation | Activity type. | -| prov:label | - | Human readable representation of the activity. | +| Attribute | Fixed Value | Description | +| -------------- | ----------- | ---------------------------------------------------------------------- | +| id | - | Internal id of the datastructure from which the annotation was parsed. | +| name | - | Annotation name/class. 
Parsed from the annotation body. | +| body | - | Annotation string. The string from which the type is parsed. | +| prov:startTime | - | Time at which the annotation was created. | +| prov:endTime | - | Time at which the annotation was created. | +| prov:type | Annotation | Activity type. | +| prov:label | - | Human readable representation of the activity. | The set of attributes for annotations can change according to the annotation type. @@ -781,38 +840,39 @@ All recognized annotation types are listed in the "Annotations" section of this **`Merge Request`** -| Attribute | Fixed Value | Description | -| ------------------------------- | ------------ | ----------------------------------------------------------------- | -| id | - | Gitlab merge request id. | -| iid | - | Internal gitlab merge request id. | -| title | - | Issue title. | -| description | - | Issue description. | -| url | - | URL to the gitlab issue. | -| source_branch | - | Merge request source branch name. | -| target_branch | - | Merge request target branch name. | -| created_at | - | Time at which the merge request was created at. | -| closed_at | - | Time at which the merge request was closed at. | -| merged_at | - | Time at which the merge request was merged at. | -| first_deployed_to_production_at | - | Time at which the merge request was first deployed to production. | -| prov:type | MergeRequest | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ------------------------------- | -------------------- | ----------------------------------------------------------------- | +| id | - | GitLab/GitHub merge request id. | +| iid | - | Internal GitLab/GitHub merge request id. | +| title | - | Merge request title. | +| body | - | Merge request body. | +| url | - | URL to the GitLab/GitHub merge request. | +| platform | `gitlab` or `github` | Platform identifier string. | +| source_branch | - | Merge request source branch name. 
| +| target_branch | - | Merge request target branch name. | +| created_at | - | Time at which the merge request was created at. | +| closed_at | - | Time at which the merge request was closed at. | +| merged_at | - | Time at which the merge request was merged at. | +| first_deployed_to_production_at | - | Time at which the merge request was first deployed to production. | +| prov:type | MergeRequest | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`Merge Request Version`** | Attribute | Fixed Value | Description | | ---------- | ------------------------- | -------------------------------------------- | -| version_id | - | Gitlab id of the merge request. | +| id | - | Gitlab/Github id of the merge request. | | prov:type | GitlabMergeRequestVersion | Entity type. | | prov:label | - | Human readable representation of the entity. | **`Annotated Merge Request Version`** -| Attribute | Fixed Value | Description | -| ------------- | ---------------------------- | ----------------------------------------------------------------- | -| version_id | - | Gitlab id of the merge request. | -| annotation_id | - | Gitlab id of the annotation that generated the annotated version. | -| prov:type | AnnotatedMergeRequestVersion | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ---------- | ---------------------------- | ------------------------------------------------------------------------ | +| id | - | Gitlab/Github id of the merge request. | +| annotation | - | Gitlab/Github id of the annotation that generated the annotated version. | +| prov:type | AnnotatedMergeRequestVersion | Entity type. | +| prov:label | - | Human readable representation of the entity. | Some PROV relations in this model are "qualified" relations. @@ -897,7 +957,7 @@ The commit is generated by the commit creation activity. 
**`Evidence`** | Attribute | Fixed Value | Description | | ------------ | ----------- | -------------------------------------------- | -| hexsha | - | Evidence SHA. | +| sha | - | Evidence SHA. | | url | - | Evidence URL. | | collected_at | - | Time at which the evidence was generated. | | prov:type | Asset | Entity type. | @@ -907,9 +967,13 @@ The commit is generated by the commit creation activity. **`Commit`** | Attribute | Fixed Value | Description | | ---------- | ----------- | -------------------------------------------- | -| hexsha | - | Commit SHA1 | -| message | - | Commit message. | +| sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | +| message | - | Commit message. | +| deleted | - | Number of lines deleted. | +| inserted | - | Number of lines inserted. | +| lines | - | Number of lines changed. | +| files | - | Number of files changed. | | prov:type | GitCommit | Entity type. | | prov:label | - | Human readable representation of the entity. | @@ -918,7 +982,7 @@ The commit is generated by the commit creation activity. | Attribute | Fixed Value | Description | | ---------- | --------------- | ------------------------------------------------- | | name | - | Tag name. | -| hexsha | - | Commit SHA1 of the commit that pushed the tag. | +| sha | - | Commit SHA1 of the commit that pushed the tag. | | message | - | Commit message of the commit that pushed the tag. | | created_at | - | Time at which the tag was created. | | prov:type | Tag | Entity type. | @@ -927,16 +991,17 @@ The commit is generated by the commit creation activity. **`Release`** -| Attribute | Fixed Value | Description | -| ----------- | --------------- | -------------------------------------------- | -| name | - | Release name. | -| description | - | Release description. | -| tag_name | - | Release tag name. | -| created_at | - | Time at which the release was created. | -| released_at | - | Time at which the release was released. 
| -| prov:type | Tag | Entity type. | -| prov:type | prov:Collection | Entity type. | -| prov:label | - | Human readable representation of the entity. | +| Attribute | Fixed Value | Description | +| ----------- | -------------------- | -------------------------------------------- | +| name | - | Release name. | +| body | - | Release body. | +| tag_name | - | Release tag name. | +| platform | `gitlab` or `github` | Platform identifier string. | +| created_at | - | Time at which the release was created. | +| released_at | - | Time at which the release was released. | +| prov:type | Tag | Entity type. | +| prov:type | prov:Collection | Entity type. | +| prov:label | - | Human readable representation of the entity. | **`Commit Author`** @@ -964,8 +1029,10 @@ The commit is generated by the commit creation activity. | --------------- | ------------- | ------------------------------------------------------------------------------------------------ | | name | - | Author name. As set in the authors GitLab profile. Only available if the token has admin rights. | | email | - | Author email. Set in the author's git config. Only available if the token has admin rights. | -| gitlab_username | - | GitLab username. As set in the authors GitLab profile. | +| gitlab_username | - | GitLab username. As set in the annotators GitLab profile. | +| github_username | - | GitHub username. As set in the annotators GitHub profile. | | gitlab_id | - | Gitlab user id. | +| github_id | - | GitHub user id. | | prov:role | ReleaseAuthor | Function of the agent in context of the release creation activity. | | prov:type | User | Agent type. | | prov:label | - | Human readable representation of the agent. | @@ -974,7 +1041,7 @@ The commit is generated by the commit creation activity. **`Commit Creation`** | Attribute | Fixed Value | Description | | -------------- | -------------- | ---------------------------------------------- | -| creation_id | - | Commit SHA1. | +| id | - | Commit SHA1. 
| | prov:startTime | - | Time at which the commit was created. | | prov:endTime | - | Time at which the commit was created. | | prov:type | CommitCreation | Activity type. | @@ -984,7 +1051,7 @@ The commit is generated by the commit creation activity. **`Tag Creation`** | Attribute | Fixed Value | Description | | -------------- | ----------- | ---------------------------------------------- | -| creation_id | - | Tag name. | +| id | - | Tag name. | | prov:startTime | - | Time at which the tag was created. | | prov:endTime | - | Time at which the tag was created. | | prov:type | TagCreation | Activity type. | @@ -994,7 +1061,7 @@ The commit is generated by the commit creation activity. **`Release Creation`** | Attribute | Fixed Value | Description | | -------------- | --------------- | ---------------------------------------------- | -| creation_id | - | Tag name. | +| id | - | Tag name. | | prov:startTime | - | Time at which the release was created. | | prov:endTime | - | Time at which the release was realeased. | | prov:type | ReleaseCreation | Activity type. 
| diff --git a/gitlab2prov/domain/objects.py b/gitlab2prov/domain/objects.py index f836a1d..80f8b18 100644 --- a/gitlab2prov/domain/objects.py +++ b/gitlab2prov/domain/objects.py @@ -28,6 +28,7 @@ @dataclass class User: + # TODO: github_email, gitlab_email name: str email: str gitlab_username: str | None = None @@ -83,6 +84,10 @@ def to_prov_element(self) -> ProvEntity: @dataclass class FileRevision(File): status: str + inserted: int + deleted: int + lines: int + score: float file: File | None = None previous: FileRevision | None = None @@ -97,6 +102,10 @@ def to_prov_element(self) -> ProvEntity: ("name", self.name), ("path", self.path), ("status", self.status), + ("inserted", self.inserted), + ("deleted", self.deleted), + ("lines", self.lines), + ("score", self.score), (PROV_TYPE, ProvType.FILE_REVISION), ] return ProvEntity( @@ -108,7 +117,7 @@ def to_prov_element(self) -> ProvEntity: @dataclass class Annotation: - uid: str + id: str name: str body: str start: datetime @@ -118,11 +127,11 @@ class Annotation: @property def identifier(self) -> QualifiedName: - return qualified_name(f"Annotation?{self.uid=}&{self.name=}") + return qualified_name(f"Annotation?{self.id=}&{self.name=}") def to_prov_element(self) -> ProvActivity: attributes = [ - ("uid", self.uid), + ("id", self.id), ("name", self.name), ("body", self.body), (PROV_ATTR_STARTTIME, self.start), @@ -135,12 +144,12 @@ def to_prov_element(self) -> ProvActivity: @dataclass class Version: - uid: str - resource: str + id: str + resource: str # ProvType @property def identifier(self) -> QualifiedName: - return qualified_name(f"{self.resource}Version?{self.uid=}") + return qualified_name(f"{self.resource}Version?{self.id=}") @classmethod def from_commit(cls, commit: Commit): @@ -155,35 +164,47 @@ def from_merge_request(cls, merge_request: MergeRequest): return cls(uid=merge_request.id, resource=ProvType.MERGE_REQUEST) def to_prov_element(self) -> ProvEntity: - attributes = [("uid", self.uid), (PROV_TYPE, 
f"{self.resource}Version")] + attributes = [("id", self.id), (PROV_TYPE, f"{self.resource}Version")] return ProvEntity(PLACEHOLDER, self.identifier, attributes) @dataclass class AnnotatedVersion: - uid: str - aid: str - resource: str + id: str + annotation: str # Annotation.id + resource: str # ProvType start: datetime @property def identifier(self) -> QualifiedName: - return qualified_name(f"Annotated{self.resource}Version?{self.uid=}&{self.aid=}") + return qualified_name(f"Annotated{self.resource}Version?{self.id=}&{self.annotation=}") @classmethod def from_commit(cls, commit: Commit, annotation: Annotation): - return cls(uid=commit.sha, aid=annotation.uid, resource=ProvType.COMMIT, start=annotation.start) + return cls( + id=commit.sha, + annotation=annotation.id, + resource=ProvType.COMMIT, + start=annotation.start, + ) @classmethod def from_issue(cls, issue: Issue, annotation: Annotation): - return cls(uid=issue.id, aid=annotation.uid, resource=ProvType.ISSUE, start=annotation.start) + return cls( + id=issue.id, annotation=annotation.id, resource=ProvType.ISSUE, start=annotation.start + ) @classmethod def from_merge_request(cls, merge_request: MergeRequest, annotation: Annotation): - return cls(uid=merge_request.id, aid=annotation.uid, resource=ProvType.MERGE_REQUEST, start=annotation.start) + return cls( + id=merge_request.id, + annotation=annotation.id, + resource=ProvType.MERGE_REQUEST, + start=annotation.start, + ) def to_prov_element(self) -> ProvEntity: - attributes = [("uid", self.uid), (PROV_TYPE, f"Annotated{self.resource}Version")] + attributes = [("id", self.id), (PROV_TYPE, f"Annotated{self.resource}Version")] return ProvEntity( PLACEHOLDER, self.identifier, @@ -193,23 +214,23 @@ def to_prov_element(self) -> ProvEntity: @dataclass class Creation: - uid: str + id: str resource: str start: datetime end: datetime @property def identifier(self) -> QualifiedName: - return qualified_name(f"Creation?{self.uid=}&{self.resource=}") + return 
qualified_name(f"Creation?{self.id=}&{self.resource=}") @classmethod def from_tag(cls, tag: GitTag): - return cls(uid=tag.name, resource=ProvType.TAG, start=tag.created_at, end=tag.created_at) + return cls(id=tag.name, resource=ProvType.TAG, start=tag.created_at, end=tag.created_at) @classmethod def from_commit(cls, commit: Commit): return cls( - uid=commit.sha, + id=commit.sha, resource=ProvType.COMMIT, start=commit.authored_at, end=commit.committed_at, @@ -218,13 +239,13 @@ def from_commit(cls, commit: Commit): @classmethod def from_issue(cls, issue: Issue): return cls( - uid=issue.id, resource=ProvType.ISSUE, start=issue.created_at, end=issue.closed_at + id=issue.id, resource=ProvType.ISSUE, start=issue.created_at, end=issue.closed_at ) @classmethod def from_merge_request(cls, merge_request: MergeRequest): return cls( - uid=merge_request.id, + id=merge_request.id, resource=ProvType.MERGE_REQUEST, start=merge_request.created_at, end=merge_request.closed_at, @@ -232,7 +253,7 @@ def from_merge_request(cls, merge_request: MergeRequest): def to_prov_element(self) -> ProvActivity: attributes = [ - ("uid", self.uid), + ("id", self.id), (PROV_ATTR_STARTTIME, self.start), (PROV_ATTR_ENDTIME, self.end), (PROV_TYPE, ProvType.CREATION), @@ -242,12 +263,17 @@ def to_prov_element(self) -> ProvActivity: @dataclass class GitCommit: - sha: str - title: str - message: str - author: User - committer: User - parents: list[str] + sha: str # commit sha + title: str # commit title + message: str # commit message + author: User # author: User + committer: User # committer: User + deletions: int # number of lines deleted + insertions: int # number of lines inserted + lines: int # number of lines changed + files: int # number of files changed + file_paths: list[str] # list of file paths of changed files + parents: list[str] # list of parent commit shas start: datetime # authored date end: datetime # committed date @@ -260,6 +286,10 @@ def to_prov_element(self) -> ProvActivity: ("sha", 
self.sha), ("title", self.title), ("message", self.message), + ("deleted", self.deleted), + ("inserted", self.inserted), + ("lines", self.lines), + ("files", self.files), ("authored_at", self.start), ("committed_at", self.end), (PROV_ATTR_STARTTIME, self.start), @@ -302,11 +332,10 @@ def to_prov_element(self) -> ProvActivity: attributes = [ ("id", self.id), ("iid", self.iid), - ("platform", self.platform), ("title", self.title), ("body", self.body), - ("url", self.url), ("platform", self.platform), + ("url", self.url), (PROV_ATTR_STARTTIME, self.created_at), (PROV_ATTR_ENDTIME, self.closed_at), (PROV_TYPE, ProvType.ISSUE), @@ -398,6 +427,10 @@ def to_prov_element(self) -> ProvActivity: ("platform", self.platform), ("source_branch", self.source_branch), ("target_branch", self.target_branch), + ("created_at", self.created_at), + ("closed_at", self.closed_at), + ("merged_at", self.merged_at), + ("first_deployed_to_production_at", self.first_deployed_to_production_at), (PROV_ATTR_STARTTIME, self.created_at), (PROV_ATTR_ENDTIME, self.closed_at), (PROV_TYPE, ProvType.MERGE_REQUEST), @@ -426,6 +459,7 @@ def to_prov_element(self) -> ProvEntity: ("name", self.name), ("sha", self.sha), ("message", self.message), + ("created_at", self.created_at), (PROV_ATTR_STARTTIME, self.created_at), (PROV_ATTR_ENDTIME, self.created_at), (PROV_TYPE, ProvType.TAG), From c6c84981538bc51af15500c921b518cedb6e4dfe Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:02:34 +0200 Subject: [PATCH 59/81] Add test cases for repository implementation --- tests/unit/test_repository.py | 96 +++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 tests/unit/test_repository.py diff --git a/tests/unit/test_repository.py b/tests/unit/test_repository.py new file mode 100644 index 0000000..99b54c2 --- /dev/null +++ b/tests/unit/test_repository.py @@ -0,0 +1,96 @@ +from gitlab2prov.adapters.repository import InMemoryRepository +from 
tests.random_generation import generate_random_users + + +class TestInMemoryRepository: + + users = generate_random_users(10) + + def test_add_resource(self): + repo = InMemoryRepository() + resource = self.users[0] + repo.add(resource) + assert len(repo.repo[type(resource)]) == 1 + assert repo.repo[type(resource)][0] == resource + + def test_get_resource_existing(self): + repo = InMemoryRepository() + resource = self.users[0] + repo.add(resource) + retrieved_resource = repo.get(type(resource)) + assert retrieved_resource == resource + + def test_get_resource_non_existing(self): + repo = InMemoryRepository() + retrieved_resource = repo.get(type(self.users[0])) + assert retrieved_resource is None + + def test_get_resource_with_filters_existing(self): + repo = InMemoryRepository() + resource1 = self.users[0] + resource2 = self.users[1] + repo.add(resource1) + repo.add(resource2) + retrieved_resource = repo.get(type(resource1), email=resource1.email, name=resource1.name) + assert retrieved_resource == resource1 + + def test_get_resource_with_filters_non_existing(self): + repo = InMemoryRepository() + resource = self.users[0] + repo.add(resource) + retrieved_resource = repo.get(type(resource), name="...", email="...") + assert retrieved_resource is None + + def test_get_resource_throws_attribute_error_for_non_existing_attributes(self): + repo = InMemoryRepository() + resource = self.users[0] + repo.add(resource) + try: + repo.get(type(resource), non_existing_attribute="...") + except AttributeError: + assert True + else: + assert False + + def test_list_all_resources(self): + repo = InMemoryRepository() + resource1 = self.users[0] + resource2 = self.users[1] + repo.add(resource1) + repo.add(resource2) + retrieved_resources = repo.list_all(type(resource1)) + assert len(retrieved_resources) == 2 + assert resource1 in retrieved_resources + assert resource2 in retrieved_resources + + def test_list_all_resources_with_filters_existing(self): + repo = InMemoryRepository() + 
resource1 = self.users[0] + resource2 = self.users[1] + repo.add(resource1) + repo.add(resource2) + retrieved_resources = repo.list_all( + type(resource1), name=resource1.name, email=resource1.email + ) + assert len(retrieved_resources) == 1 + assert resource1 in retrieved_resources + + def test_list_all_resources_with_filters_non_existing(self): + repo = InMemoryRepository() + resource1 = self.users[0] + resource2 = self.users[1] + repo.add(resource1) + repo.add(resource2) + retrieved_resources = repo.list_all(type(resource1), name="...", email="...") + assert len(retrieved_resources) == 0 + + def test_list_all_resources_throws_attribute_error_for_non_existing_attributes(self): + repo = InMemoryRepository() + resource1 = self.users[0] + repo.add(resource1) + try: + repo.list_all(type(resource1), non_existing_attribute="...") + except AttributeError: + assert True + else: + assert False From 66781350c38cc01cef810aae7a184a295bd555f3 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:03:27 +0200 Subject: [PATCH 60/81] Add test cases for file objects --- tests/unit/objects/test_file.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/unit/objects/test_file.py diff --git a/tests/unit/objects/test_file.py b/tests/unit/objects/test_file.py new file mode 100644 index 0000000..ef3cca3 --- /dev/null +++ b/tests/unit/objects/test_file.py @@ -0,0 +1,26 @@ +from gitlab2prov.domain.objects import File + + +class TestFile: + def test_file_creation(self): + # Test File object creation + file_obj = File(name="test_file.txt", path="/path/to/file", commit="12345") + assert file_obj.name == "test_file.txt" + assert file_obj.path == "/path/to/file" + assert file_obj.commit == "12345" + + def test_identifier_property(self): + # Test identifier property + file_obj = File(name="test_file.txt", path="/path/to/file", commit="12345") + assert ( + file_obj.identifier.localpart + == 
"File?name=test_file.txt&path=/path/to/file&commit=12345" + ) + + def test_to_prov_element_method(self): + # Test to_prov_element() method + file_obj = File(name="test_file.txt", path="/path/to/file", commit="12345") + prov_entity = file_obj.to_prov_element() + assert prov_entity.get_attribute("name") == "test_file.txt" + assert prov_entity.get_attribute("path") == "/path/to/file" + assert prov_entity.get_attribute("commit") == "12345" From 9c21dfe4adc08fd06731b5512857d2ecbb308c64 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:03:49 +0200 Subject: [PATCH 61/81] Add test cases for user objects --- tests/unit/objects/test_user.py | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 tests/unit/objects/test_user.py diff --git a/tests/unit/objects/test_user.py b/tests/unit/objects/test_user.py new file mode 100644 index 0000000..0d835b0 --- /dev/null +++ b/tests/unit/objects/test_user.py @@ -0,0 +1,83 @@ +from gitlab2prov.domain.objects import User +from gitlab2prov.domain.constants import ProvType + + +class TestUser: + # Test cases for User class + def test_user_creation(self): + # Test User creation with valid inputs + user = User(name="John Doe", email="johndoe@example.com") + assert user.name == "John Doe" + assert user.email == "johndoe@example.com" + assert user.gitlab_username is None + assert user.github_username is None + assert user.gitlab_id is None + assert user.github_id is None + assert user.prov_role is None + + # Test User creation with optional parameters + user = User( + name="Jane Smith", + email="janesmith@example.com", + gitlab_username="janesmith", + github_username="janesmith", + gitlab_id="123", + github_id="456", + prov_role="developer", + ) + assert user.name == "Jane Smith" + assert user.email == "janesmith@example.com" + assert user.gitlab_username == "janesmith" + assert user.github_username == "janesmith" + assert user.gitlab_id == "123" + assert user.github_id == 
"456" + assert user.prov_role == "developer" + + def test_user_post_init(self): + # Test __post_init__() method with lowercase email + user = User(name="John Doe", email="JohnDoe@example.com") + assert user.email == "johndoe@example.com" + + # Test __post_init__() method with None email + user = User(name="Jane Smith", email=None) + assert user.email is None + + def test_user_identifier(self): + # Test identifier property + user = User(name="John Doe", email="johndoe@example.com") + assert user.identifier.localpart == "User?name=John Doe&email=johndoe@example.com" + + def test_user_to_prov_element(self): + # Test to_prov_element() method with minimum attributes + user = User(name="John Doe", email="johndoe@example.com") + prov_element = user.to_prov_element() + assert prov_element.identifier == "User?name=John Doe&email=johndoe@example.com" + assert prov_element.attributes == [ + ("name", "John Doe"), + ("email", "johndoe@example.com"), + ("prov_role", None), + ("prov_type", ProvType.USER), + ] + + # Test to_prov_element() method with all attributes + user = User( + name="Jane Smith", + email="janesmith@example.com", + gitlab_username="janesmith", + github_username="janesmith", + gitlab_id="123", + github_id="456", + prov_role="developer", + ) + prov_element = user.to_prov_element() + assert prov_element.identifier == "User?name=Jane Smith&email=janesmith@example.com" + assert prov_element.attributes == [ + ("name", "Jane Smith"), + ("email", "janesmith@example.com"), + ("gitlab_username", "janesmith"), + ("github_username", "janesmith"), + ("gitlab_id", "123"), + ("github_id", "456"), + ("prov_role", "developer"), + ("prov_type", ProvType.USER), + ] From dbf2a984a7ea82d272056a543e3b222f8abbef09 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:05:37 +0200 Subject: [PATCH 62/81] Add script to generate randomized objects --- tests/random_generation.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 
tests/random_generation.py diff --git a/tests/random_generation.py b/tests/random_generation.py new file mode 100644 index 0000000..32d5c70 --- /dev/null +++ b/tests/random_generation.py @@ -0,0 +1,27 @@ +import random +import string + +from gitlab2prov.domain.objects import User + + +def generate_random_user(): + name = "".join(random.choice(string.ascii_letters) for _ in range(6)) + email = f"{name}@example.com" + gitlab_username = "".join(random.choice(string.ascii_lowercase) for _ in range(6)) + github_username = "".join(random.choice(string.ascii_lowercase) for _ in range(6)) + gitlab_id = str(random.randint(1000, 9999)) + github_id = str(random.randint(1000, 9999)) + prov_role = random.choice(["admin", "user", "guest", None]) + return User( + name=name, + email=email, + gitlab_username=gitlab_username, + github_username=github_username, + gitlab_id=gitlab_id, + github_id=github_id, + prov_role=prov_role, + ) + + +def generate_random_users(num_users: int) -> list[User]: + return [generate_random_user() for _ in range(num_users)] From 12d7d87575aac214b14cdaeafb853d67fee7a27b Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:28:42 +0200 Subject: [PATCH 63/81] Rename random_generation to conftest.py --- tests/{random_generation.py => conftest.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{random_generation.py => conftest.py} (100%) diff --git a/tests/random_generation.py b/tests/conftest.py similarity index 100% rename from tests/random_generation.py rename to tests/conftest.py From b37fba789b88bbc464a9c6845a1fd9b2de7afc85 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:29:10 +0200 Subject: [PATCH 64/81] Use pytest fixtures to generate random objects --- tests/unit/test_repository.py | 55 ++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/tests/unit/test_repository.py b/tests/unit/test_repository.py index 99b54c2..912ef62 100644 
--- a/tests/unit/test_repository.py +++ b/tests/unit/test_repository.py @@ -1,49 +1,49 @@ +import pytest from gitlab2prov.adapters.repository import InMemoryRepository -from tests.random_generation import generate_random_users class TestInMemoryRepository: - users = generate_random_users(10) - def test_add_resource(self): + def test_add_resource(self, random_user): repo = InMemoryRepository() - resource = self.users[0] + resource = random_user repo.add(resource) assert len(repo.repo[type(resource)]) == 1 assert repo.repo[type(resource)][0] == resource - def test_get_resource_existing(self): + def test_get_resource_existing(self, random_user): repo = InMemoryRepository() - resource = self.users[0] + resource = random_user repo.add(resource) retrieved_resource = repo.get(type(resource)) assert retrieved_resource == resource - def test_get_resource_non_existing(self): + def test_get_resource_non_existing(self, random_user): repo = InMemoryRepository() - retrieved_resource = repo.get(type(self.users[0])) + retrieved_resource = repo.get(type(random_user)) assert retrieved_resource is None - def test_get_resource_with_filters_existing(self): + @pytest.mark.fixt_data(2) + def test_get_resource_with_filters_existing(self, n_random_users): repo = InMemoryRepository() - resource1 = self.users[0] - resource2 = self.users[1] + resource1 = n_random_users[0] + resource2 = n_random_users[1] repo.add(resource1) repo.add(resource2) retrieved_resource = repo.get(type(resource1), email=resource1.email, name=resource1.name) assert retrieved_resource == resource1 - def test_get_resource_with_filters_non_existing(self): + def test_get_resource_with_filters_non_existing(self, random_user): repo = InMemoryRepository() - resource = self.users[0] + resource = random_user repo.add(resource) retrieved_resource = repo.get(type(resource), name="...", email="...") assert retrieved_resource is None - def test_get_resource_throws_attribute_error_for_non_existing_attributes(self): + def 
test_get_resource_throws_attribute_error_for_non_existing_attributes(self, random_user): repo = InMemoryRepository() - resource = self.users[0] + resource = random_user repo.add(resource) try: repo.get(type(resource), non_existing_attribute="...") @@ -52,10 +52,11 @@ def test_get_resource_throws_attribute_error_for_non_existing_attributes(self): else: assert False - def test_list_all_resources(self): + @pytest.mark.fixt_data(2) + def test_list_all_resources(self, n_random_users): repo = InMemoryRepository() - resource1 = self.users[0] - resource2 = self.users[1] + resource1 = n_random_users[0] + resource2 = n_random_users[1] repo.add(resource1) repo.add(resource2) retrieved_resources = repo.list_all(type(resource1)) @@ -63,10 +64,11 @@ def test_list_all_resources(self): assert resource1 in retrieved_resources assert resource2 in retrieved_resources - def test_list_all_resources_with_filters_existing(self): + pytest.mark.fixt_data(2) + def test_list_all_resources_with_filters_existing(self, n_random_users): repo = InMemoryRepository() - resource1 = self.users[0] - resource2 = self.users[1] + resource1 = n_random_users[0] + resource2 = n_random_users[1] repo.add(resource1) repo.add(resource2) retrieved_resources = repo.list_all( @@ -75,18 +77,19 @@ def test_list_all_resources_with_filters_existing(self): assert len(retrieved_resources) == 1 assert resource1 in retrieved_resources - def test_list_all_resources_with_filters_non_existing(self): + @pytest.mark.fixt_data(2) + def test_list_all_resources_with_filters_non_existing(self, n_random_users): repo = InMemoryRepository() - resource1 = self.users[0] - resource2 = self.users[1] + resource1 = n_random_users[0] + resource2 = n_random_users[1] repo.add(resource1) repo.add(resource2) retrieved_resources = repo.list_all(type(resource1), name="...", email="...") assert len(retrieved_resources) == 0 - def test_list_all_resources_throws_attribute_error_for_non_existing_attributes(self): + def 
test_list_all_resources_throws_attribute_error_for_non_existing_attributes(self, random_user): repo = InMemoryRepository() - resource1 = self.users[0] + resource1 = random_user repo.add(resource1) try: repo.list_all(type(resource1), non_existing_attribute="...") From 35f1c41228071f6186930d420cc0015ecbc78a02 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 22 May 2023 11:29:41 +0200 Subject: [PATCH 65/81] Add fixtures to conftest.py --- tests/conftest.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 32d5c70..c130ca3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import random import string +import pytest from gitlab2prov.domain.objects import User @@ -21,7 +22,15 @@ def generate_random_user(): github_id=github_id, prov_role=prov_role, ) + +@pytest.fixture +def random_user() -> User: + return generate_random_user() -def generate_random_users(num_users: int) -> list[User]: - return [generate_random_user() for _ in range(num_users)] + +@pytest.fixture +def n_random_users(request) -> list[User]: + marker = request.node.get_closest_marker("fixt_data") + n = 10 if marker is None else marker.args[0] + return [generate_random_user() for _ in range(n)] From d69bba226538a4a1f1e6c02e46ed53ce12cbfa06 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:41:55 +0200 Subject: [PATCH 66/81] Rename 'formatter' key to 'format' in schema definition --- gitlab2prov/config/schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitlab2prov/config/schema.json b/gitlab2prov/config/schema.json index f39e719..d208473 100644 --- a/gitlab2prov/config/schema.json +++ b/gitlab2prov/config/schema.json @@ -139,7 +139,7 @@ "coarse": { "type": "boolean" }, - "formatter": { + "format": { "type": "string", "enum": [ "table", From 564d0d57975af4dbb8bf2f729e41f71de93c9753 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: 
Sun, 27 Aug 2023 15:44:14 +0200 Subject: [PATCH 67/81] Add 'click' dependency, rename CLI entrypoints, and include 'schema.json' in package data --- pyproject.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f755dfd..2dc9863 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "ruamel.yaml", "pydot>=1.2.0", "PyGithub", + "click", ] keywords = [ "prov", @@ -47,8 +48,8 @@ dynamic = ["version"] dev = ["pytest", "pytest-mock", "black", "isort", "bump2version"] [project.scripts] -gitlab2prov = "gitlab2prov.entrypoints.cli:gitlab_cli" -github2prov = "gitlab2prov.entrypoints.cli:github_cli" +gitlab2prov = "gitlab2prov.entrypoints.cli:gitlab2prov" +github2prov = "gitlab2prov.entrypoints.cli:github2prov" [project.urls] Twitter = "https://twitter.com/dlr_software" @@ -61,6 +62,9 @@ version = { attr = "gitlab2prov.__version__" } [tool.setuptools.packages.find] exclude = ["tests*", "docs*"] +[tool.setuptools.package-data] +"gitlab2prov.config" = ["schema.json"] + [tool.isort] profile = "black" py_version = 310 From 0db55bb406cc70b6312f48476d73466d673194f0 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:45:18 +0200 Subject: [PATCH 68/81] Refactor ProvenanceContext and models to improve attribute usage and simplify document access --- gitlab2prov/prov/model.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/gitlab2prov/prov/model.py b/gitlab2prov/prov/model.py index 9f5574a..97c5f12 100644 --- a/gitlab2prov/prov/model.py +++ b/gitlab2prov/prov/model.py @@ -79,7 +79,7 @@ def hosted_resource_query(repository: Repository, resource_type: Type[HostedReso class ProvenanceContext: document: ProvDocument namespace: Optional[str] = None - + def add_element(self, dataclass_instance) -> ProvRecord: # Convert the dataclass instance to a ProvElement element = self.convert_to_prov_element(dataclass_instance) @@ 
-121,10 +121,7 @@ def add_relation( relationship.add_attributes(attributes) # Add the relationship to the ProvDocument self.document.add_record(relationship) - - def get_document(self): - return self.document - + @dataclass class FileAdditionModel: @@ -167,8 +164,12 @@ def build_provenance_model(self) -> ProvDocument: self.commit, ProvGeneration, { - PROV_ATTR_STARTTIME: self.commit.start, + PROV_ATTR_STARTTIME: self.commit.authored_at, PROV_ROLE: ProvRole.FILE, + "insertions": self.revision.insertions, + "deletions": self.revision.deletions, + "lines": self.revision.lines, + "score": self.revision.score, }, ) self.ctx.add_relation( @@ -176,14 +177,15 @@ def build_provenance_model(self) -> ProvDocument: self.commit, ProvGeneration, { - PROV_ATTR_STARTTIME: self.commit.start, + PROV_ATTR_STARTTIME: self.commit.authored_at, PROV_ROLE: ProvRole.ADDED_REVISION, }, ) self.ctx.add_relation(self.revision.file, self.commit.author, ProvAttribution) self.ctx.add_relation(self.revision, self.revision.file, ProvSpecialization) # Return the document - return self.ctx.get_document() + return self.ctx.document + @dataclass @@ -222,10 +224,10 @@ def build_provenance_model(self) -> ProvDocument: self.revision, self.commit, ProvInvalidation, - {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.DELETED_REVISION}, + {PROV_ATTR_STARTTIME: self.commit.authored_at, PROV_ROLE: ProvRole.DELETED_REVISION}, ) # Return the document - return self.ctx.get_document() + return self.ctx.document @dataclass @@ -266,7 +268,7 @@ def build_provenance_model(self) -> ProvDocument: self.revision, self.commit, ProvGeneration, - {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.MODIFIED_REVISION}, + {PROV_ATTR_STARTTIME: self.commit.authored_at, PROV_ROLE: ProvRole.MODIFIED_REVISION}, ) self.ctx.add_relation(self.revision, self.commit.author, ProvAttribution) self.ctx.add_relation( @@ -276,10 +278,10 @@ def build_provenance_model(self) -> ProvDocument: self.commit, self.previous, 
ProvUsage, - {PROV_ATTR_STARTTIME: self.commit.start, PROV_ROLE: ProvRole.PREVIOUS_REVISION}, + {PROV_ATTR_STARTTIME: self.commit.authored_at, PROV_ROLE: ProvRole.PREVIOUS_REVISION}, ) # Return the document - return self.ctx.get_document() + return self.ctx.document @dataclass @@ -319,7 +321,7 @@ def build_provenance_model(self): previous_annotation = current_annotation previous_version = current_version - return self.ctx.get_document() + return self.ctx.document def _add_creation_part_for_hosted_commits(self): # Add the elements to the context @@ -557,7 +559,7 @@ def build_provenance_model(self) -> ProvDocument: ProvAssociation, {PROV_ROLE: ProvRole.COMMIT_AUTHOR}, ) - return self.ctx.get_document() + return self.ctx.document @dataclass From 0638a9ef38040edc58dc5dbe9ce6f1f8191ae6c6 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:46:40 +0200 Subject: [PATCH 69/81] Refactor domain object attributes for clarity - Renamed attributes for better clarity (e.g., inserted to insertions, start to uthored_at) - Removed redundant attributes like ile_paths - Consolidated datetime attributes (start and end to uthored_at and committed_at) - Standardized uid attribute to id across various domain classes --- gitlab2prov/domain/objects.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/gitlab2prov/domain/objects.py b/gitlab2prov/domain/objects.py index 80f8b18..418e884 100644 --- a/gitlab2prov/domain/objects.py +++ b/gitlab2prov/domain/objects.py @@ -84,8 +84,8 @@ def to_prov_element(self) -> ProvEntity: @dataclass class FileRevision(File): status: str - inserted: int - deleted: int + insertions: int + deletions: int lines: int score: float file: File | None = None @@ -102,8 +102,8 @@ def to_prov_element(self) -> ProvEntity: ("name", self.name), ("path", self.path), ("status", self.status), - ("inserted", self.inserted), - ("deleted", self.deleted), + ("insertions", self.insertions), + 
("deletions", self.deletions), ("lines", self.lines), ("score", self.score), (PROV_TYPE, ProvType.FILE_REVISION), @@ -153,15 +153,15 @@ def identifier(self) -> QualifiedName: @classmethod def from_commit(cls, commit: Commit): - return cls(uid=commit.sha, resource=ProvType.COMMIT) + return cls(id=commit.sha, resource=ProvType.COMMIT) @classmethod def from_issue(cls, issue: Issue): - return cls(uid=issue.id, resource=ProvType.ISSUE) + return cls(id=issue.id, resource=ProvType.ISSUE) @classmethod def from_merge_request(cls, merge_request: MergeRequest): - return cls(uid=merge_request.id, resource=ProvType.MERGE_REQUEST) + return cls(id=merge_request.id, resource=ProvType.MERGE_REQUEST) def to_prov_element(self) -> ProvEntity: attributes = [("id", self.id), (PROV_TYPE, f"{self.resource}Version")] @@ -271,11 +271,10 @@ class GitCommit: deletions: int # number of lines deleted insertions: int # number of lines inserted lines: int # number of lines changed - files: int # number of files changed - file_paths: list[str] # list of file paths of changed files + files_changed: int # number of files changed parents: list[str] # list of parent commit shas - start: datetime # authored date - end: datetime # committed date + authored_at: datetime # authored date + committed_at: datetime # committed date @property def identifier(self) -> QualifiedName: @@ -286,14 +285,14 @@ def to_prov_element(self) -> ProvActivity: ("sha", self.sha), ("title", self.title), ("message", self.message), - ("deleted", self.deleted), - ("inserted", self.inserted), + ("deletions", self.deletions), + ("insertions", self.insertions), ("lines", self.lines), - ("files", self.files), - ("authored_at", self.start), - ("committed_at", self.end), - (PROV_ATTR_STARTTIME, self.start), - (PROV_ATTR_ENDTIME, self.end), + ("files_changed", self.files_changed), + ("authored_at", self.authored_at), + ("committed_at", self.committed_at), + (PROV_ATTR_STARTTIME, self.authored_at), + (PROV_ATTR_ENDTIME, 
self.committed_at), (PROV_TYPE, ProvType.GIT_COMMIT), ] return ProvActivity(PLACEHOLDER, self.identifier, attributes) From cfd9b4885c6652f765dff72500c70de31151a5fa Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:47:37 +0200 Subject: [PATCH 70/81] Enhance CLI commands and streamline configuration handling - Improved naming clarity: Renamed command groups from gitlab_cli to gitlab2prov and github_cli to github2prov. - Refined config validation and loading: Split invoke_command_line_from_config to separate functions for loading and validation (load_and_validate_config) and command execution (execute_command_from_config). - Introduced clear separation between loading and validation of config to provide clearer error messages. - Replaced --verbose option with --explain for statistics command for better understanding of its purpose. - Minor cleanups: Removed unnecessary whitespace and lines. --- gitlab2prov/entrypoints/cli.py | 76 ++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/gitlab2prov/entrypoints/cli.py b/gitlab2prov/entrypoints/cli.py index 28865e2..273e70a 100644 --- a/gitlab2prov/entrypoints/cli.py +++ b/gitlab2prov/entrypoints/cli.py @@ -14,33 +14,40 @@ from gitlab2prov.prov import operations -def enable_logging(ctx: click.Context, _, enable: bool): +def enable_logging(ctx: click.Context, param: str, enable: bool): """Callback that optionally enables logging.""" if enable: create_logger() -def invoke_command_line_from_config(ctx: click.Context, _, filepath: str): - """Callback that executes a gitlab2prov run from a config file.""" +def load_and_validate_config(ctx: click.Context, filepath: str) -> Config: + """Load configuration from file and validate it. 
Returns the config if successful, otherwise fails the context.""" if not filepath: - return + return None config = Config.read(filepath) - ok, err = config.validate() - if not ok: - ctx.fail(f"Validation failed: {err}") + is_valid, error_message = config.validate() + if not is_valid: + ctx.fail(f"Validation failed: {error_message}") + return config + + +def execute_command_from_config(ctx: click.Context, param: str, filepath: str): + """Callback that executes a gitlab2prov run from a config file.""" + config = load_and_validate_config(ctx, filepath) + if not config: + return + context = ctx.command.make_context(ctx.command.name, args=config.parse(), parent=ctx) ctx.command.invoke(context) ctx.exit() -def validate_config(ctx: click.Context, _, filepath: str): +def validate_config(ctx: click.Context, param: str, filepath: str): """Callback that validates config file using gitlab2prov/config/schema.json.""" - if not filepath: + config = load_and_validate_config(ctx, filepath) + if not config: return - config = Config.read(filepath) - ok, err = config.validate() - if not ok: - ctx.fail(f"Validation failed: {err}") + click.echo("Validation successful, the following command would be executed:\n") click.echo(f"gitlab2prov {' '.join(config.parse())}") ctx.exit() @@ -94,7 +101,7 @@ def new_func(stream, *args, **kwargs): "--config", type=click.Path(exists=True, dir_okay=False), expose_value=False, - callback=invoke_command_line_from_config, + callback=execute_command_from_config, help="Read config from file.", ) @click.option( @@ -105,7 +112,7 @@ def new_func(stream, *args, **kwargs): help="Validate config file and exit.", ) @click.pass_context -def gitlab_cli(ctx): +def gitlab2prov(ctx): """ Extract provenance information from GitLab projects. 
""" @@ -126,7 +133,7 @@ def gitlab_cli(ctx): "--config", type=click.Path(exists=True, dir_okay=False), expose_value=False, - callback=invoke_command_line_from_config, + callback=execute_command_from_config, help="Read config from file.", ) @click.option( @@ -137,12 +144,12 @@ def gitlab_cli(ctx): help="Validate config file and exit.", ) @click.pass_context -def github_cli(ctx): +def github2prov(ctx): ctx.obj = bootstrap.bootstrap("github") -@github_cli.result_callback() -@gitlab_cli.result_callback() +@github2prov.result_callback() +@gitlab2prov.result_callback() def process_commands(processors, **kwargs): """Execute the chain of commands. @@ -247,7 +254,6 @@ def write(bus, documents, formats, destination): documents = list(documents) for i, document in enumerate(documents, start=1): - for fmt in formats: filename = f"{destination}{'-' + str(i) if len(documents) > 1 else ''}.{fmt}" try: @@ -325,14 +331,14 @@ def combine(bus, documents: Iterator[ProvDocument]): ) @click.option("--format", type=click.Choice(["csv", "table"]), default="table") @click.option( - "--verbose", + "--explain", is_flag=True, help="Print a textual summary of all operations applied to the graphs.", ) @processor @click.pass_obj def statistics( - bus, documents: Iterator[ProvDocument], resolution: str, format: str, verbose: bool + bus, documents: Iterator[ProvDocument], resolution: str, format: str, explain: bool ): """Print statistics for one or more provenance documents. 
@@ -343,7 +349,7 @@ def statistics( for document in documents: try: statistics = bus.handle(commands.Statistics(document, resolution, format)) - if verbose: + if explain: statistics = f"{document.description}\n\n{statistics}" click.echo(statistics) except Exception: @@ -352,17 +358,17 @@ def statistics( # CLI group for gitlab commands -gitlab_cli.add_command(extract) -gitlab_cli.add_command(read) -gitlab_cli.add_command(write) -gitlab_cli.add_command(combine) -gitlab_cli.add_command(transform) -gitlab_cli.add_command(statistics) +gitlab2prov.add_command(extract) +gitlab2prov.add_command(read) +gitlab2prov.add_command(write) +gitlab2prov.add_command(combine) +gitlab2prov.add_command(transform) +gitlab2prov.add_command(statistics) # CLI group for github commands -github_cli.add_command(extract) -github_cli.add_command(read) -github_cli.add_command(write) -github_cli.add_command(combine) -github_cli.add_command(transform) -github_cli.add_command(statistics) +github2prov.add_command(extract) +github2prov.add_command(read) +github2prov.add_command(write) +github2prov.add_command(combine) +github2prov.add_command(transform) +github2prov.add_command(statistics) From 93e5cfa4c4e78da830e67c9f61e4d87d8cff9f5e Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:48:50 +0200 Subject: [PATCH 71/81] Refactor ProjectUrl and improve URL parsing - Introduced post-initialization method __post_init__ to parse the URL upon object creation, thus reducing repetitive calls to urlsplit. - Simplified slug property logic by utilizing the parsed URL path directly, ensuring clearer and more concise code. - Removed individual properties for etloc and scheme in favor of the parsed attributes from __post_init__. - Streamlined the clone_url method by removing redundant parameters and leveraging a dictionary lookup for platform-specific URLs. - Adapted child classes GitlabProjectUrl and GithubProjectUrl to match the streamlined method signature. 
--- gitlab2prov/adapters/project_url.py | 35 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/gitlab2prov/adapters/project_url.py b/gitlab2prov/adapters/project_url.py index e73112d..b7eed3e 100644 --- a/gitlab2prov/adapters/project_url.py +++ b/gitlab2prov/adapters/project_url.py @@ -5,41 +5,40 @@ @dataclass class ProjectUrl: url: str + scheme: str = "https" + + def __post_init__(self): + parsed_url = urlsplit(self.url) + self.url_path = parsed_url.path + self.netloc = parsed_url.netloc @property def slug(self) -> str: - if path := urlsplit(self.url).path: - owner, project = (s for s in path.split("/") if s) + if self.url_path: + *owner, project = self.url_path.split("/") + owner = "/".join(owner)[1:] return f"{owner}/{project}" - return None + return "" @property def instance(self) -> str: return f"{self.scheme}://{self.netloc}" - @property - def netloc(self): - return urlsplit(self.url).netloc - - @property - def scheme(self): - return "https" - - def clone_url(self, platform: str, token: str | None = None, method: str = "https"): - urls = { + def clone_url(self, platform: str, token: str = "") -> str: + platform_urls = { "gitlab": f"{self.instance}:{token}@{self.netloc}/{self.slug}", "github": f"{self.scheme}://{token}@{self.netloc}/{self.slug}.git", } - return urls.get(platform) + return platform_urls.get(platform, "") @dataclass class GitlabProjectUrl(ProjectUrl): - def clone_url(self, token: str | None = None, method: str = "https"): - return super().clone_url("gitlab", token, method) + def clone_url(self, token: str = ""): + return super().clone_url("gitlab", token) @dataclass class GithubProjectUrl(ProjectUrl): - def clone_url(self, token: str | None = None, method: str = "https"): - return super().clone_url("github", token, method) + def clone_url(self, token: str = ""): + return super().clone_url("github", token) From 768ff9d2e14db8620c43f9697e3b75cce6bc8535 Mon Sep 17 00:00:00 2001 From: cdboer 
<dev@claas.plus> Date: Sun, 27 Aug 2023 15:51:12 +0200 Subject: [PATCH 72/81] Update Git fetcher with additional commit and revision attributes - Add 'deletions', 'insertions', 'lines', and 'files_changed' attributes to 'extract_commits'. - Rename 'start' to 'authored_at' and 'end' to 'committed_at' in 'extract_commits'. - Add 'insertions', 'deletions', 'lines', and 'score' attributes to 'FileRevision' in 'extract_revisions'. - Correct typo in comment from 'remeber' to 'remember' in 'extract_revisions'. --- gitlab2prov/adapters/git/fetcher.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/gitlab2prov/adapters/git/fetcher.py b/gitlab2prov/adapters/git/fetcher.py index 052110d..eccb7e5 100644 --- a/gitlab2prov/adapters/git/fetcher.py +++ b/gitlab2prov/adapters/git/fetcher.py @@ -97,9 +97,13 @@ def extract_commits(repo: Repo) -> Iterator[GitCommit]: message=commit.message, author=get_author(commit), committer=get_committer(commit), + deletions=commit.stats.total["deletions"], + insertions=commit.stats.total["insertions"], + lines=commit.stats.total["lines"], + files_changed=commit.stats.total["files"], parents=[parent.hexsha for parent in commit.parents], - start=commit.authored_datetime, - end=commit.committed_datetime, + authored_at=commit.authored_datetime, + committed_at=commit.committed_datetime, ) @@ -136,10 +140,18 @@ def extract_revisions(repo: Repo) -> Iterator[FileRevision]: status = {"A": "added", "M": "modified", "D": "deleted"}.get(status, "modified") revs.append( FileRevision( - name=Path(path).name, path=path, commit=hexsha, status=status, file=file + name=Path(path).name, + path=path, + commit=hexsha, + status=status, + insertions=0, + deletions=0, + lines=0, + score=0, + file=file, ) ) - # revisions remeber their predecessor (previous revision) + # revisions remember their predecessor (previous revision) for rev, prev in zip_longest(revs, revs[1:]): rev.previous = prev yield rev From 
bfc85009dc950407359ed6009533680e66741930 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:53:42 +0200 Subject: [PATCH 73/81] Refactor GithubAnnotationParser and Enhance Annotation Validation - Introduced a ilter_valid method to filter annotations without nnotator or start values. - Modified parse method to use ilter_valid after sorting annotations. - Changed property name uid to id in methods: parse_commit_comment, parse_commit_status, parse_award_reaction, parse_issue_comment, and parse_issue_event. - Removed extraneous whitespace after parse_award_reaction method. --- gitlab2prov/adapters/hub/parser.py | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/gitlab2prov/adapters/hub/parser.py b/gitlab2prov/adapters/hub/parser.py index 493256a..eb3d2fb 100644 --- a/gitlab2prov/adapters/hub/parser.py +++ b/gitlab2prov/adapters/hub/parser.py @@ -46,12 +46,20 @@ def choose_parser(self, raw_annotation: A) -> Callable[[A], Annotation]: case _: log.warning(f"no parser found for {raw_annotation=}") + @staticmethod + def filter_valid(annotations): + return [ + annot + for annot in annotations + if annot.annotator is not None and annot.start is not None + ] + def parse(self, annotations: list[A]) -> list[Annotation]: parsed_annotations = [] for annotation in annotations: if parser := self.choose_parser(annotation): parsed_annotations.append(parser(annotation)) - return self.sort_by_date(parsed_annotations) + return self.filter_valid(self.sort_by_date(parsed_annotations)) def parse_commit_comment(self, comment: CommitComment) -> Annotation: annotator = User( @@ -62,7 +70,7 @@ def parse_commit_comment(self, comment: CommitComment) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=comment.id, + id=comment.id, name="add_comment", body=comment.body, start=comment.created_at, @@ -79,7 +87,7 @@ def parse_commit_status(self, status: CommitStatus) -> Annotation: 
prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=status.id, + id=status.id, name="add_commit_status", body=status.description, start=status.created_at, @@ -96,14 +104,13 @@ def parse_reaction(self, reaction: Reaction) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=reaction.id, + id=reaction.id, name="add_award", body=reaction.content, start=reaction.created_at, end=reaction.created_at, annotator=annotator, ) - def parse_issue_comment(self, comment: IssueComment) -> Annotation: annotator = User( @@ -114,7 +121,7 @@ def parse_issue_comment(self, comment: IssueComment) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=comment.id, + id=comment.id, name="add_comment", body=comment.body, start=comment.created_at, @@ -131,7 +138,7 @@ def parse_issue_event(self, event: IssueEvent) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=event.id, + id=event.id, name=event.event, body=event.event, start=event.created_at, @@ -140,20 +147,21 @@ def parse_issue_event(self, event: IssueEvent) -> Annotation: ) def parse_timeline_event(self, event: TimelineEvent) -> Annotation: - annotator = User( - name=event.actor.name, - email=event.actor.email, - github_username=event.actor.login, - github_id=event.actor.id, - prov_role=ProvRole.ANNOTATOR, - ) return Annotation( - uid=event.id, + id=event.id, name=event.event, body=event.event, start=event.created_at, end=event.created_at, - annotator=annotator, + annotator=User( + name=event.actor.name, + email=event.actor.email, + github_username=event.actor.login, + github_id=event.actor.id, + prov_role=ProvRole.ANNOTATOR, + ) + if event.actor + else None, ) def parse_pull_request_review(self, review: PullRequestReview) -> Annotation: @@ -165,7 +173,7 @@ def parse_pull_request_review(self, review: PullRequestReview) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=review.id, + id=review.id, name="add_review", body=review.body, 
start=review.submitted_at, @@ -182,7 +190,7 @@ def parse_pull_request_comment(self, comment: PullRequestComment) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=comment.id, + id=comment.id, name="add_comment", body=comment.body, start=comment.created_at, From f5c9218407e5f64b40f3275c7029f7080c7ace9b Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:54:26 +0200 Subject: [PATCH 74/81] Update Annotation Property Name in GitlabAnnotationParser - Replaced uid with id in the methods for parsing notes, comments, awards, and labels. --- gitlab2prov/adapters/lab/parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gitlab2prov/adapters/lab/parser.py b/gitlab2prov/adapters/lab/parser.py index 7116914..2a4f959 100644 --- a/gitlab2prov/adapters/lab/parser.py +++ b/gitlab2prov/adapters/lab/parser.py @@ -67,7 +67,7 @@ def parse_system_note(self, note: ProjectIssueNote | ProjectMergeRequestNote) -> ) annotation_name, key_value_pairs = self.classifier.classify(note.body) return Annotation( - uid=note.id, + id=note.id, name=annotation_name, body=note.body, start=note.created_at, @@ -85,7 +85,7 @@ def parse_comment(self, comment: ProjectCommitComment) -> Annotation: prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", + id=f"{uuid.uuid4()}{annotator.gitlab_id}{abs(hash(comment.note))}", name="add_comment", body=comment.note, start=comment.created_at, @@ -102,7 +102,7 @@ def parse_note(self, note: ProjectIssueNote | ProjectMergeRequestNote) -> Annota prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=note.id, + id=note.id, name="add_note", body=note.body, annotator=annotator, @@ -125,7 +125,7 @@ def parse_award( prov_role=ProvRole.ANNOTATOR, ) return Annotation( - uid=award.id, + id=award.id, name="add_award", body=award.name, annotator=annotator, @@ -144,7 +144,7 @@ def parse_label( prov_role=ProvRole.ANNOTATOR, ) return 
Annotation( - uid=label.id, + id=label.id, name=f"{label.action}_label", body=label.action, annotator=annotator, From 35dd383893594d86c602b0c41cf00613a077dfc2 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Sun, 27 Aug 2023 15:55:40 +0200 Subject: [PATCH 75/81] Refactor and Enhance Provenance Operations - Improved docstrings for clarity in various methods. - Updated error handling in the ead_provenance_file method to handle file not found exceptions. - Refactored the deserialize_string method to provide better feedback on deserialization failures. - Modified the file write mode in write_provenance_file based on the overwrite parameter. - Enhanced ASCII table formatting in ormat_stats_as_ascii_table. - Introduced new methods for pseudonymization: - generate_pseudonym - pseudonymize_agent - pseudonymize_relation - Overhauled existing pseudonymize method for better clarity and efficiency. --- gitlab2prov/prov/operations.py | 134 ++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 42 deletions(-) diff --git a/gitlab2prov/prov/operations.py b/gitlab2prov/prov/operations.py index 0b89e00..8cf07d6 100644 --- a/gitlab2prov/prov/operations.py +++ b/gitlab2prov/prov/operations.py @@ -36,57 +36,63 @@ def read_provenance_file(filename: str) -> ProvDocument: - """Read provenance document from file or sys.stdin.""" - if filename == "-": - content = sys.stdin.read() - else: - with open(filename, "r") as f: - content = f.read() + """Read a ProvDocument from a file or sys.stdin.""" + try: + if filename == "-": + content = sys.stdin.read() + else: + with open(filename, "r") as f: + content = f.read() + except FileNotFoundError: + raise FileNotFoundError(f"File {filename} does not exist.") return deserialize_string(content=content) def deserialize_string(content: str, format: str = None): """Deserialize a ProvDocument from a string.""" - for format in DESERIALIZATION_FORMATS: + formats = [format] if format else DESERIALIZATION_FORMATS + for fmt in 
formats: try: - doc = ProvDocument.deserialize(content=content, format=format) - return doc + return ProvDocument.deserialize(content=content, format=fmt) except Exception: pass - raise Exception(f"Deserialization failed for {content=}, {format=}") + raise ValueError(f"Deserialization failed for content: {content} and format: {format}") def write_provenance_file( document: ProvDocument, filename: str, format: str = "json", overwrite: bool = True -): - """Write provenance document to file.""" - if Path(filename).exists() and not overwrite: +) -> None: + """Write a ProvDocument to a file.""" + mode = "x" if not overwrite else "w" + try: + with open(filename, mode) as f: + f.write(serialize_string(document, format=format)) + except FileExistsError: raise FileExistsError(f"File {filename} already exists.") - with open(filename, "w") as f: - f.write(serialize_string(document, format=format)) def serialize_string(document: ProvDocument, format: str = "json") -> str: """Serialize a ProvDocument to a string.""" if format not in SERIALIZATION_FORMATS: - raise ValueError("Unsupported serialization format.") - if format != "dot": - return document.serialize(format=format) - return prov_to_dot(document).to_string() + raise ValueError(f"Unsupported serialization format: {format}") + if format == "dot": + return prov_to_dot(document).to_string() + return document.serialize(format=format) def format_stats_as_ascii_table(stats: dict[str, int]) -> str: - table = f"|{'Record Type':20}|{'Count':20}|\n+{'-'*20}+{'-'*20}+\n" - for record_type, count in stats.items(): - table += f"|{record_type:20}|{count:20}|\n" - return table + """Format a dictionary as an ASCII table.""" + header = "|Record Type |Count |\n" + line = "+---------------------+--------------------+\n" + rows = [f"|{record_type:20}|{count:20}|" for record_type, count in stats.items()] + return f"{header}{line}{''.join(rows)}" def format_stats_as_csv(stats: dict[str, int]) -> str: - csv = "Record Type, Count\n" - for 
record_type, count in stats.items(): - csv += f"{record_type}, {count}\n" - return csv + """Format a dictionary as a CSV string.""" + header = "Record Type, Count\n" + rows = [f"{record_type}, {count}" for record_type, count in stats.items()] + return f"{header}{''.join(rows)}" def stats(graph: ProvDocument, resolution: str, format: str = "table") -> str: @@ -233,21 +239,6 @@ def get_attribute(record: ProvRecord, attribute: str, first: bool = True) -> str return choices[0] if first else choices -def pseudonymize_agent( - agent: ProvAgent, - identifier: QualifiedName, - keep: list[QualifiedName], - replace: dict[str, Any], -) -> ProvAgent: - kept = [(key, val) for key, val in agent.extra_attributes if key in keep] - replaced = [ - (key, replace.get(key.localpart, val)) - for key, val in agent.extra_attributes - if key.localpart in replace - ] - return ProvAgent(agent.bundle, identifier, kept + replaced) - - def pseudonymize(graph: ProvDocument) -> ProvDocument: log.info(f"pseudonymize agents in {graph=}") @@ -290,3 +281,62 @@ def pseudonymize(graph: ProvDocument) -> ProvDocument: records.append(r_type(relation.bundle, relation.identifier, formal + extra)) return graph_factory(records) + + +def generate_pseudonym(name: str, email: str = None) -> QualifiedName: + """Generate pseudonym using hashed name and email.""" + name_hash = hashlib.sha256(bytes(name, "utf-8")).hexdigest() + email_hash = hashlib.sha256(bytes(email, "utf-8")).hexdigest() if email else None + return qualified_name(f"User?name={name_hash}&email={email_hash}") + + +def pseudonymize_agent(agent: ProvAgent, pseudonyms: dict) -> ProvAgent: + """Replace agent identifier with pseudonym.""" + name = get_attribute(agent, USERNAME) + mail = get_attribute(agent, USEREMAIL) + + if name is None: + raise ValueError("ProvAgent representing a user has to have a name!") + + pseudonym = generate_pseudonym(name, mail) + + keep = [PROV_ROLE, PROV_TYPE] + replace = {USERNAME: name, USEREMAIL: mail} + + kept = [(key, 
val) for key, val in agent.extra_attributes if key in keep] + replaced = [ + (key, replace.get(key.localpart, val)) + for key, val in agent.extra_attributes + if key.localpart in replace + ] + + pseudonymized_agent = ProvAgent(agent.bundle, pseudonym, kept + replaced) + + return pseudonymized_agent, agent.identifier, pseudonym + + +def pseudonymize_relation(relation: ProvRelation, pseudonyms: dict) -> ProvRelation: + """Replace relation identifiers with pseudonyms.""" + formal = [(key, pseudonyms.get(val, val)) for key, val in relation.formal_attributes] + extra = [(key, pseudonyms.get(val, val)) for key, val in relation.extra_attributes] + r_type = PROV_REC_CLS.get(relation.get_type()) + return r_type(relation.bundle, relation.identifier, formal + extra) + + +def pseudonymize(graph: ProvDocument) -> ProvDocument: + """Pseudonymize agents in a ProvDocument.""" + log.info(f"Pseudonymize agents in {graph=}") + + records = list(graph.get_records((ProvActivity, ProvEntity))) + pseudonyms = dict() + + for agent in graph.get_records(ProvAgent): + pseudonymized_agent, original_id, pseudonym = pseudonymize_agent(agent) + pseudonyms[original_id] = pseudonym + records.append(pseudonymized_agent) + + for relation in graph.get_records(ProvRelation): + pseudonymized_relation = pseudonymize_relation(relation, pseudonyms) + records.append(pseudonymized_relation) + + return graph_factory(records) From 01c28b21d622e57389a7035ecc6bd670b91dd756 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 09:57:18 +0200 Subject: [PATCH 76/81] Update config file example --- config/example.yaml | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/config/example.yaml b/config/example.yaml index 017c570..d3f8eaf 100644 --- a/config/example.yaml +++ b/config/example.yaml @@ -1,20 +1,27 @@ # yaml-language-server: $schema=../gitlab2prov/config/schema.json - extract: - url: ["https://gitlab.com/example/foo"] - token: tokenFoo + 
url: + - "https://gitlab.com/aristotle/nicomachean-ethics" + - "https://gitlab.com/aristotle/poetics" + token: golden_mean_and_drama_token - extract: - url: ["https://gitlab.com/example/bar"] - token: tokenBar + url: + - "https://gitlab.com/plato/the-republic" + - "https://gitlab.com/plato/phaedrus" + token: ideal_forms_and_speech_token +- extract: + url: ["https://gitlab.com/socrates/apology"] + token: know_thyself_token - read: - input: [example.rdf] + input: [aristotelian_logic.rdf] - combine: - transform: - use_pseudonyms: true - remove_duplicates: true + use_pseudonyms: true + remove_duplicates: true - write: - output: combined - format: [json, rdf, xml, dot] + output: philosopher_outputs + format: [json, rdf, xml, dot] - stats: - fine: true - explain: true - formatter: table \ No newline at end of file + fine: true + explain: true + format: table \ No newline at end of file From f93d403c10c49ce40beb4eddb0797abb2f71114c Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 09:57:27 +0200 Subject: [PATCH 77/81] Update README.md --- README.md | 121 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index f398b88..3367266 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -<h1 align="center">Welcome to <code>gitlab2prov</code>! 👋</h1> +<h1 align="center">Welcome to <code>gitlab2prov</code> & <code>github2prov</code> ! 👋</h1> <p align="center"> <a href="https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE"> <img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-yellow.svg" target="_blank" /> @@ -30,15 +30,17 @@ </p> -> `gitlab2prov` is a Python library and command line tool that extracts provenance information from GitLab projects. +> `gitlab2prov` is a Python library and command line tool that extracts provenance information from GitLab projects. 
GitHub support is provided by the `github2prov` command line tool contained in this package. --- -The `gitlab2prov` data model has been designed according to [W3C PROV](https://www.w3.org/TR/prov-overview/) specification. -The model documentation can be found [here](https://github.com/DLR-SC/gitlab2prov/tree/master/docs). +The data model underlying `gitlab2prov` & `github2prov` has been designed according to [W3C PROV](https://www.w3.org/TR/prov-overview/) specification. +The model documentation can be found [here](/docs/README.md). ## ️🏗️ ️Installation +Please note that this tool requires Git to be installed on your machine. + Clone the project and install using `pip`: ```bash pip install . @@ -55,17 +57,27 @@ pip install .[dev] # clone repo, install with extras pip install gitlab2prov[dev] # PyPi, install with extras ``` +That's it! You can now use `gitlab2prov` and `github2prov` from the command line. + +```bash +gitlab2prov --version # show version +github2prov --version # show version +``` + + ## ⚡ Getting started -`gitlab2prov` requires a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab API. +`gitlab2prov` & `github2prov` require a [personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) to clone git repositories and to authenticate with the GitLab/GitHub API. 
-Use the following guide to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: +Use the following guides to obtain a token with the required [scopes](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#personal-access-token-scopes) for yourself: - [Create a personal access token (GitLab)](./docs/guides/gitlab-token.md) - [Create a personal access token (GitHub)](./docs/guides/github-token.md) ## 🚀‍ Usage +The usage of `gitlab2prov` and `github2prov` is identical. The only difference being that `github2prov` only supports GitHub projects whereas `gitlab2prov` supports only GitLab projects. We will use `gitlab2prov` in the following examples. + `gitlab2prov` can be configured using the command line interface or by providing a configuration file in `.yaml` format. ### Command Line Usage @@ -84,67 +96,85 @@ Options: --help Show this message and exit. Commands: - combine Combine one or more provenance documents. - extract Extract provenance information for one or more gitlab... - read Read provenance information from file[s]. - stats Print statistics for one or more provenance documents. - transform Apply a set of transformations to provenance documents. - write Write provenance information to file[s]. + combine Combine one or more provenance documents. + extract Extract provenance information for one or more gitlab projects. + read Read provenance information from file[s]. + stats Print statistics for one or more provenance documents. + transform Apply a set of transformations to provenance documents. + write Write provenance information to file[s]. ``` ### Configuration Files `gitlab2prov` supports configuration files in `.yaml` format that are functionally equivalent to command line invocations. 
-To read configuration details from a file instead of specifying on the command line, use the `--config` option: +To invoke a run using a config file, use the `--config` option: ```ini -# initiate a run using a config file +# run gitlab2prov using the config file 'config/example.yaml' gitlab2prov --config config/example.yaml ``` -You can validate your config file using the provided JSON-Schema `gitlab2prov/config/schema.json` that comes packaged with every installation: +You can validate your config file using the provided [JSON Schema file](gitlab2prov/config/schema.json) that comes packaged with every installation: ```ini -# check config file for syntactical errors +# validate config file 'config/example.yaml' against the JSON Schema gitlab2prov --validate config/example.yaml ``` -Config file example: +Here is an example config file that extracts provenance information from three GitLab projects, reads a serialized provenance document from a file, combines the resulting provenance documents, transforms the combined document and writes it to files in different formats. 
Finally, statistics about the generated output are printed to the console: ```yaml - extract: - url: ["https://gitlab.com/example/foo"] - token: tokenA + url: + - "https://gitlab.com/aristotle/nicomachean-ethics" + - "https://gitlab.com/aristotle/poetics" + token: golden_mean_and_drama_token +- extract: + url: + - "https://gitlab.com/plato/the-republic" + - "https://gitlab.com/plato/phaedrus" + token: ideal_forms_and_speech_token - extract: - url: ["https://gitlab.com/example/bar"] - token: tokenB -- load: - input: [example.rdf] + url: ["https://gitlab.com/socrates/apology"] + token: know_thyself_token +- read: + input: [aristotelian_logic.rdf] - combine: - transform: - use_pseudonyms: true - remove_duplicates: true -- save: - output: combined - format: [json, rdf, xml, dot] + use_pseudonyms: true + remove_duplicates: true +- write: + output: philosopher_outputs + format: [json, rdf, xml, dot] - stats: - fine: true - explain: true - formatter: table + fine: true + explain: true + format: table ``` The config file example is functionally equivalent to this command line invocation: ``` -gitlab2prov extract -u https://gitlab.com/example/foo -t tokenFoo \ - extract -u https://gitlab.com/example/bar -t tokenBar \ - load -i example.rdf \ - combine \ - transform --use-pseudonyms --remove_duplicates \ - write -o combined -f json -f rdf -f xml -f dot \ - stats --fine --explain --formatter table +gitlab2prov \ + extract \ + --url https://gitlab.com/aristotle/nicomachean-ethics \ + --url https://gitlab.com/aristotle/poetics \ + --token golden_mean_and_drama_token \ + extract \ + --url https://gitlab.com/plato/the-republic \ + --url https://gitlab.com/plato/phaedrus \ + --token ideal_forms_and_speech_token \ + extract \ + --url https://gitlab.com/socrates/apology --token know_thyself_token \ + read --input aristotelian_logic.rdf \ + combine \ + transform --use_pseudonyms --remove_duplicates \ + write --output philosopher_outputs \ + --format json --format rdf --format xml 
--format dot \ + stats --fine --explain --format table + ``` ### 🎨 Provenance Output Formats -`gitlab2prov` supports output formats that the [`prov`](https://github.com/trungdong/prov) library provides: +`gitlab2prov` & `github2prov` support all output formats that the [`prov`](https://github.com/trungdong/prov) library provides: * [PROV-N](http://www.w3.org/TR/prov-n/) * [PROV-O](http://www.w3.org/TR/prov-o/) (RDF) * [PROV-XML](http://www.w3.org/TR/prov-xml/) @@ -199,6 +229,19 @@ You can also cite specific releases published on Zenodo: [![DOI](https://zenodo. * Andreas Schreiber (2020). [Visualization of contributions to open-source projects](https://doi.org/10.1145/3430036.3430057). In *Proceedings of the 13th International Symposium on Visual Information Communication and Interaction*. ACM, USA. +## 📜 Dependencies +`gitlab2prov` depends on several open source packages that are made freely available under their respective licenses. + +| Package | License | +|-----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------| +| [GitPython](https://github.com/gitpython-developers/GitPython) | [![License](https://img.shields.io/badge/License-BSD_3--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause) | +| [click](https://github.com/pallets/click) | [![License](https://img.shields.io/badge/License-BSD_3--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause) | +| [python-gitlab](https://github.com/python-gitlab/python-gitlab) | [![License: LGPL v3](https://img.shields.io/badge/License-LGPL_v3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0) | +| [prov](https://pypi.org/project/prov/) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | +| [jsonschema](https://github.com/python-jsonschema/jsonschema) | [![License: 
MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | +| [ruamel.yaml](https://pypi.org/project/ruamel.yaml/) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | +| [pydot](https://github.com/pydot/pydot) | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) | + ## 📝 License This project is [MIT](https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE) licensed. Copyright © 2019 German Aerospace Center (DLR) and individual contributors. From 66c591f25ebc70a60764b758d07d69052a38f0c6 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 10:11:46 +0200 Subject: [PATCH 78/81] Update README title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3367266..65f7874 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -<h1 align="center">Welcome to <code>gitlab2prov</code> & <code>github2prov</code> ! 
👋</h1> +<h1 align="center"> <code>gitlab2prov</code> 🦊🐈‍⬛ <code>github2prov</code> :octocat: </h1> <p align="center"> <a href="https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE"> <img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-yellow.svg" target="_blank" /> From f87a18f3f8ead68d4b759c67529f904e41c30980 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 10:12:26 +0200 Subject: [PATCH 79/81] Update README title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65f7874..710f36b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -<h1 align="center"> <code>gitlab2prov</code> 🦊🐈‍⬛ <code>github2prov</code> :octocat: </h1> +<h1 align="center"> <code>gitlab2prov</code>, <code>github2prov: (🦊|🐈‍⬛) → 📄</code> </h1> <p align="center"> <a href="https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE"> <img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-yellow.svg" target="_blank" /> From d28ee4d14f97cbb12209b00b3fb96f1bc238e9de Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 10:12:59 +0200 Subject: [PATCH 80/81] Update README title --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 710f36b..7af5b93 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -<h1 align="center"> <code>gitlab2prov</code>, <code>github2prov: (🦊|🐈‍⬛) → 📄</code> </h1> +<h1 align="center"> <code>gitlab2prov</code>, <code>github2prov</code>: (🦊|🐈‍⬛) → 📄 </h1> <p align="center"> <a href="https://github.com/dlr-sc/gitlab2prov/blob/master/LICENSE"> <img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-yellow.svg" target="_blank" /> From 56818cab4c2484d6e3e869b1634f9a8cf4e5e141 Mon Sep 17 00:00:00 2001 From: cdboer <dev@claas.plus> Date: Mon, 28 Aug 2023 10:22:53 +0200 Subject: [PATCH 81/81] Update node attribute tables --- docs/README.md | 248 
++++++++++++++++++++++++------------------------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/docs/README.md b/docs/README.md index 207af1e..9c32bba 100644 --- a/docs/README.md +++ b/docs/README.md @@ -55,7 +55,7 @@ Both entities are generated by the commit activity and are attributed to the aut **`Author`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | -------------------------------------------------------- | +|-----------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | gitlab_username | - | Gitlab user account username. | @@ -69,7 +69,7 @@ Both entities are generated by the commit activity and are attributed to the aut **`Committer`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | -------------------------------------------------------- | +|-----------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | gitlab_username | - | Gitlab user account username. | @@ -83,12 +83,12 @@ Both entities are generated by the commit activity and are attributed to the aut **`Commit`** | Attribute | Fixed Value | Description | -| -------------- | ----------------------- | ------------------------------------------- | +|----------------|-------------------------|---------------------------------------------| | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | -| deleted | - | Number of lines deleted. | -| inserted | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | | lines | - | Number of lines changed. 
| | files | - | Number of files changed. | | authored_at | - | Time at which the commit was authored. | @@ -101,7 +101,7 @@ Both entities are generated by the commit activity and are attributed to the aut **`File`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | ------------------------------------------------------------------ | +|------------|-------------|--------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. | | commit | - | SHA1 of the commit that added this file to the repository. | @@ -111,13 +111,13 @@ Both entities are generated by the commit activity and are attributed to the aut **`File Revision`** | Attribute | Fixed Value | Description | -| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +|------------|------------------------------------|-----------------------------------------------------------------------------------------------------| | name | - | Current file name. | | path | - | Current file path of this revision. | | commit | - | SHA1 of the commit that added this revision to the repository. | | status | `added` or `modified` or `deleted` | Change status of the file revision. | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | | lines | - | Number of lines changed. | | score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | | prov:type | FileRevision | Entity type. | @@ -131,31 +131,31 @@ The following tables define the attributes attached to these relations. 
**`File - [wasGeneratedBy] -> Commit`** | Attribute | Fixed Value | Description | -| --------- | -------------------- | -------------------------------------------------------------- | +|-----------|----------------------|----------------------------------------------------------------| | prov:role | File | Function of the File entity in context of the Commit activity. | | prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was generated. | **`File Revision - [wasGeneratedBy] -> Commit`** -| Attribute | Fixed Value | Description | -| --------- | ----------------------------- | --------------------------------------------------------------------------------------------------- | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | -| lines | - | Number of lines changed. | -| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | -| prov:role | FileRevisionAtPointOfAddition | Function of the FileRevision entity in context of the Commit activity. | -| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was generated. | +| Attribute | Fixed Value | Description | +|------------|-------------------------------|-----------------------------------------------------------------------------------------------------| +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:role | FileRevisionAtPointOfAddition | Function of the FileRevision entity in context of the Commit activity. | +| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was generated. 
| **`Commit - [wasAssociatedWith] -> Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | --------------------------------------------------------------- | +|-----------|-------------|-----------------------------------------------------------------| | prov:role | Author | Function of the Author agent in context of the Commit activity. | **`Commit - [wasAssociatedWith] -> Committer`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------ | +|-----------|-------------|--------------------------------------------------------------------| | prov:role | Committer | Function of the Committer agent in context of the Commit activity. | @@ -187,7 +187,7 @@ All revisions are marked as specializations of the File entity. **`Author`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | Author | Function of the agent in context of the commit activity. | @@ -197,7 +197,7 @@ All revisions are marked as specializations of the File entity. **`Committer`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | Committer | Function of the agent in context of the commit activity. | @@ -207,12 +207,12 @@ All revisions are marked as specializations of the File entity. 
**`Commit`** | Attribute | Fixed Value | Description | -| -------------- | ----------------------- | ---------------------------------------------- | +|----------------|-------------------------|------------------------------------------------| | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | -| deleted | - | Number of lines deleted. | -| inserted | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | | lines | - | Number of lines changed. | | files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | @@ -223,7 +223,7 @@ All revisions are marked as specializations of the File entity. **`File`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | ------------------------------------------------------------------ | +|------------|-------------|--------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. | | commit | - | SHA1 of the commit that added this file to the repository. | @@ -233,13 +233,13 @@ All revisions are marked as specializations of the File entity. **`File Revision`** | Attribute | Fixed Value | Description | -| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +|------------|------------------------------------|-----------------------------------------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. | | commit | - | SHA1 of the commit that added this file to the repository. | | status | `added` or `modified` or `deleted` | Change status of the file revision. 
| -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | | lines | - | Number of lines changed. | | score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | | prov:type | FileRevision | Entity type. | @@ -248,13 +248,13 @@ All revisions are marked as specializations of the File entity. **`Previous File Revision`** | Attribute | Fixed Value | Description | -| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +|------------|------------------------------------|-----------------------------------------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. | | commit | - | SHA1 of the commit that added this file to the repository. | | status | `added` or `modified` or `deleted` | Change status of the file revision. | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | | lines | - | Number of lines changed. | | score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | | prov:type | FileRevision | Entity type. | @@ -267,31 +267,31 @@ The following tables define the attributes attached to these relations. 
**`Commit - [used] -> Previous File Revision`** | Attribute | Fixed Value | Description | -| --------- | ------------------------------ | -------------------------------------------------------------- | +|-----------|--------------------------------|----------------------------------------------------------------| | prov:role | FileRevisionBeforeModification | Function of the File entity in context of the Commit activity. | | prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was used. | **`File Revision - [wasGeneratedBy] -> Commit`** -| Attribute | Fixed Value | Description | -| --------- | ----------------------------- | --------------------------------------------------------------------------------------------------- | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | -| lines | - | Number of lines changed. | -| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | -| prov:role | FileRevisionAfterModification | Function of the File entity in context of the Commit activity. | -| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was generated. | +| Attribute | Fixed Value | Description | +|------------|-------------------------------|-----------------------------------------------------------------------------------------------------| +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | +| prov:role | FileRevisionAfterModification | Function of the File entity in context of the Commit activity. | +| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the File entity was generated. 
| **`Commit - [wasAssociatedWith] -> Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | --------------------------------------------------------------- | +|-----------|-------------|-----------------------------------------------------------------| | prov:role | Author | Function of the Author agent in context of the Commit activity. | **`Commit - [wasAssociatedWith] -> Committer`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------ | +|-----------|-------------|--------------------------------------------------------------------| | prov:role | Committer | Function of the Committer agent in context of the Commit activity. | @@ -318,7 +318,7 @@ The deleted revision is invalidated by the commit that removes it from the repos **`Author`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | Author | Function of the agent in context of the commit activity. | @@ -328,7 +328,7 @@ The deleted revision is invalidated by the commit that removes it from the repos **`Committer`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | Committer | Function of the agent in context of the commit activity. 
| @@ -338,12 +338,12 @@ The deleted revision is invalidated by the commit that removes it from the repos **`Commit`** | Attribute | Fixed Value | Description | -| -------------- | ----------------------- | ------------------------------------------- | +|----------------|-------------------------|---------------------------------------------| | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | -| deleted | - | Number of lines deleted. | -| inserted | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | | lines | - | Number of lines changed. | | files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | @@ -354,7 +354,7 @@ The deleted revision is invalidated by the commit that removes it from the repos **`File`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | ------------------------------------------------------------------ | +|------------|-------------|--------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. | | commit | - | SHA1 of the commit that added this file to the repository. | @@ -364,13 +364,13 @@ The deleted revision is invalidated by the commit that removes it from the repos **`File Revision`** | Attribute | Fixed Value | Description | -| ---------- | ---------------------------------- | --------------------------------------------------------------------------------------------------- | +|------------|------------------------------------|-----------------------------------------------------------------------------------------------------| | name | - | Original file name. | | path | - | Original file path. The path at which this file was first created. 
| | commit | - | SHA1 of the commit that added this file to the repository. | | status | `added` or `modified` or `deleted` | Change status of the file revision. | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | | lines | - | Number of lines changed. | | score | - | Percentage of similarity compared to previous revision ([Docs](https://git-scm.com/docs/git-diff)). | | prov:type | FileRevision | Entity type. | @@ -384,24 +384,24 @@ The following tables define the attributes attached to these relations. **`Commit - [wasAssociatedWith] -> Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | --------------------------------------------------------------- | +|-----------|-------------|-----------------------------------------------------------------| | prov:role | Author | Function of the Author agent in context of the Commit activity. | **`Commit - [wasAssociatedWith] -> Committer`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------ | +|-----------|-------------|--------------------------------------------------------------------| | prov:role | Committer | Function of the Committer agent in context of the Commit activity. | **`File Revision - [wasInvalidatedBy] -> Commit`** -| Attribute | Fixed Value | Description | -| --------- | ----------------------------- | ---------------------------------------------------------------------- | -| inserted | - | Number of lines inserted. | -| deleted | - | Number of lines deleted. | -| lines | - | Number of lines changed. | -| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was invalidated. | -| prov:role | FileRevisionAtPointOfDeletion | Function of the FileRevision entity in context of the Commit activity. 
| +| Attribute | Fixed Value | Description | +|------------|-------------------------------|------------------------------------------------------------------------| +| insertions | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| lines | - | Number of lines changed. | +| prov:time | `COMMIT_AUTHOR_DATE` | Time at which the FileRevision entity was invalidated. | +| prov:role | FileRevisionAtPointOfDeletion | Function of the FileRevision entity in context of the Commit activity. | ## GitLab: Commit Web Resource @@ -442,7 +442,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Gitlab Commit Author`** | Attribute | Fixed Value | Description | -| ---------- | ------------------ | -------------------------------------------------------- | +|------------|--------------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | GitlabCommitAuthor | Function of the agent in context of the commit activity. | @@ -452,7 +452,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Annotator`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | --------------------------------------------------------- | +|-----------------|-------------|-----------------------------------------------------------| | name | - | Annotator given name. | | gitlab_username | - | GitLab username. As set in the annotators GitLab profile. | | github_username | - | GitHub username. As set in the annotators GitHub profile. 
| @@ -465,12 +465,12 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Git Commit`** | Attribute | Fixed Value | Description | -| -------------- | ----------------------- | ---------------------------------------------- | +|----------------|-------------------------|------------------------------------------------| | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | -| deleted | - | Number of lines deleted. | -| inserted | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | | lines | - | Number of lines changed. | | files | - | Number of files changed. | | prov:startTime | `COMMIT_AUTHOR_DATE` | Time at which the commit activity started. | @@ -481,7 +481,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Creation`** | Attribute | Fixed Value | Description | -| -------------- | ----------------------- | ----------------------------------------------- | +|----------------|-------------------------|-------------------------------------------------| | id | - | SHA1 of the commit that triggered the creation. | | prov:startTime | `COMMIT_COMMITTER_DATE` | Time at which the web resource was created. | | prov:endTime | `COMMIT_COMMITTER_DATE` | Time at which the web resource was created. | @@ -491,7 +491,7 @@ This way, the model captures the lineage of the GitLab commit web resource and a **`Annotation`** | Attribute | Fixed Value | Description | -| -------------- | ----------- | ----------------------------------------------------------------------------- | +|----------------|-------------|-------------------------------------------------------------------------------| | id | - | Internal GitLab ID of the datastructure from which the annotation was parsed. | | name | - | Annotation name/class. Parsed from the annotation body. | | body | - | Annotation string. 
The string from which the type is parsed. | @@ -509,7 +509,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Commit`** | Attribute | Fixed Value | Description | -| ---------- | -------------------- | ----------------------------------------------------- | +|------------|----------------------|-------------------------------------------------------| | sha | - | Commit SHA1. | | url | - | URL to the webpage of the gitlab commit web resource. | | platform | `gitlab` or `github` | Platform identifier string. | @@ -519,7 +519,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Commit Version`** | Attribute | Fixed Value | Description | -| ---------- | ------------------------- | -------------------------------------------- | +|------------|---------------------------|----------------------------------------------| | id | - | Commit SHA1. | | prov:type | ResourceAtPointOfAddition | Entity type. | | prov:label | - | Human readable representation of the entity. | @@ -527,7 +527,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Annotated Commit Version`** | Attribute | Fixed Value | Description | -| ---------- | ------------------------ | -------------------------------------------- | +|------------|--------------------------|----------------------------------------------| | id | - | Commit SHA1. | | annotation | - | Gitlab annotation id. | | prov:type | AnnotatedResourceVersion | Entity type. | @@ -541,40 +541,40 @@ The following tables define the attributes attached to these relations. 
**`Creation - [wasAssociatedWith] -> Gitlab Commit Author`** | Attribute | Fixed Value | Description | -| --------- | ------------------ | ----------------------------------------------------------------------------- | +|-----------|--------------------|-------------------------------------------------------------------------------| | prov:role | GitlabCommitAuthor | Function of the Gitlab Commit Author agent in context of the Commit activity. | **` Annotation - [wasAssociatedWith] -> Annotator`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------ | +|-----------|-------------|--------------------------------------------------------------------| | prov:role | Annotator | Function of the Annotator agent in context of the Commit activity. | **`Commit - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ---------------------- | ---------------------------------------------------------------- | +|-----------|------------------------|------------------------------------------------------------------| | prov:role | GitlabCommitCreation | Function of the Commit entity in context of the Commit activity. | | prov:time | `COMMIT_COMMITTER_DATE` | Time at which the Commit entity was generated. | **`Commit Version - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ---------------------- | ------------------------------------------------------------------------ | +|-----------|------------------------|--------------------------------------------------------------------------| | prov:role | GitlabCommitVersion | Function of the Commit Version entity in context of the Commit activity. | | prov:time | `COMMIT_COMMITTER_DATE` | Time at which the Commit Version entity was generated.
| **`Annotated Commit Version - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ---------------------------- | ---------------------------------------------------------------- | +|-----------|------------------------------|------------------------------------------------------------------| | prov:role | AnnotatedGitlabCommitVersion | Function of the commit entity in context of the commit activity. | | prov:time | - | Time at which the annotated commit version entity was generated. | **`Annotated Commit Version - [used] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ------------------------------------------------- | ---------------------------------------------------------------- | +|-----------|---------------------------------------------------|------------------------------------------------------------------| | prov:role | AnnotatedGitlabCommitVersion, GitlabCommitVersion | Function of the commit entity in context of the commit activity. | | prov:time | - | Time at which the annotated commit version entity was generated. | @@ -617,7 +617,7 @@ This allows for capturing the changes in the issue web resource after each annot **`Issue Author`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | -------------------------------------------------------- | +|-----------------|-------------|----------------------------------------------------------| | name | - | Author name. | | gitlab_username | - | GitLab username. As set in the authors gitlab profile. | | github_username | - | GitHub username. As set in the authors gitlab profile. 
| @@ -630,7 +630,7 @@ This allows for capturing the changes in the issue web resource after each annot **`Annotator`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | -------------------------------------------------------------- | +|-----------------|-------------|----------------------------------------------------------------| | name | - | Annotator given name. As set in the annotators gitlab profile. | | gitlab_username | - | GitLab username. As set in the authors gitlab profile. | | github_username | - | GitHub username. As set in the authors gitlab profile. | @@ -643,7 +643,7 @@ This allows for capturing the changes in the issue web resource after each annot **`Creation`** | Attribute | Fixed Value | Description | -| -------------- | ------------- | ---------------------------------------------- | +|----------------|---------------|------------------------------------------------| | id | - | Gitlab issue id. | | prov:startTime | - | Time at which the web resource was created. | | prov:endTime | - | Time at which the web resource was created. | @@ -653,7 +653,7 @@ This allows for capturing the changes in the issue web resource after each annot **`Annotation`** | Attribute | Fixed Value | Description | -| -------------- | ----------- | ----------------------------------------------------------------------------- | +|----------------|-------------|-------------------------------------------------------------------------------| | id | - | Internal gitlab id of the datastructure from which the annotation was parsed. | | name | - | Annotation name/class. Parsed from the annotation body. | | body | - | Annotation string. The string from which the type is parsed. 
| @@ -671,11 +671,11 @@ All recognized annotation types are listed in the "Annotations" section of this **`Issue`** | Attribute | Fixed Value | Description | -| ---------- | -------------------- | -------------------------------------------- | +|------------|----------------------|----------------------------------------------| | id | - | Issue ID. | | iid | - | Internal issue ID. | | title | - | Issue title. | -| body | - | Issue body. | +| body | - | Issue body. | | platform | `gitlab` or `github` | Platform identifier string. | | url | - | Issue webpage url. | | created_at | - | Time at which the issue was created at. | @@ -686,7 +686,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Issue Version`** | Attribute | Fixed Value | Description | -| ---------- | ------------ | -------------------------------------------- | +|------------|--------------|----------------------------------------------| | id | - | GitLab/GitHub id of the issue. | | prov:type | IssueVersion | Entity type. | | prov:label | - | Human readable representation of the entity. | @@ -694,7 +694,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Annotated Issue Version`** | Attribute | Fixed Value | Description | -| ---------- | --------------------- | ------------------------------------------------------------------------ | +|------------|-----------------------|--------------------------------------------------------------------------| | id | - | GitLab/GitHub id of the issue. | | annotation | - | GitLab/GitHub id of the annotation that generated the annotated version. | | prov:type | AnnotatedIssueVersion | Entity type. | @@ -708,40 +708,40 @@ The following tables define the attributes attached to these relations. 
**`Creation - [wasAssociatedWith] -> Issue Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ----------------------------------------------------------------------- | +|-----------|-------------|-------------------------------------------------------------------------| | prov:role | IssueAuthor | Function of the issue author agent in context of the creation activity. | **`Annotation - [wasAssociatedWith] -> Annotator`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ---------------------------------------------------------------------- | +|-----------|-------------|------------------------------------------------------------------------| | prov:role | Annotator | Function of the annotator agent in context of the annotation activity. | **`Issue - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ----------------------------------------------------------------- | +|-----------|-------------|-------------------------------------------------------------------| | prov:role | Resource | Function of the issue entity in context of the creation activity. | | prov:time | - | Time at which the issue entity was generated. | **`Issue Version - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | -------------------------------- | ----------------------------------------------------------------- | +|-----------|----------------------------------|-------------------------------------------------------------------| | prov:role | ResourceVersionAtPointOfCreation | Function of the issue entity in context of the creation activity. | | prov:time | - | Time at which the issue version entity was generated. 
| **`Annotated Issue Version - [wasGeneratedBy] -> Annotation`** | Attribute | Fixed Value | Description | -| --------- | ------------------------------ | ----------------------------------------------------------------- | +|-----------|--------------------------------|-------------------------------------------------------------------| | prov:role | ResourceVersionAfterAnnotation | Function of the issue entity in context of the creation activity. | | prov:time | - | Time at which the annotated issue version entity was generated. | **`Annotation - [used] -> Issue Version`** | Attribute | Fixed Value | Description | -| --------- | ---------------------------- | ----------------------------------------------------------------- | +|-----------|------------------------------|-------------------------------------------------------------------| | prov:role | ResourceVersionToBeAnnotated | Function of the issue entity in context of the creation activity. | | prov:time | - | Time at which the issue version entity was generated. | @@ -787,7 +787,7 @@ Each annotated merge request version is generated by the corresponding annotatio **`Merge Request Author`** | Attribute | Fixed Value | Description | -| --------------- | ------------------ | --------------------------------------------------------------- | +|-----------------|--------------------|-----------------------------------------------------------------| | name | - | Author name. | | gitlab_username | - | GitLab username. As set in the authors gitlab profile. | | github_username | - | GitHub username. As set in the authors gitlab profile. 
| @@ -800,7 +800,7 @@ Each annotated merge request version is generated by the corresponding annotatio **`Annotator`** | Attribute | Fixed Value | Description | -| --------------- | ----------- | --------------------------------------------------------------- | +|-----------------|-------------|-----------------------------------------------------------------| | name | - | Author name. | | gitlab_username | - | GitLab username. As set in the authors gitlab profile. | | github_username | - | GitHub username. As set in the authors gitlab profile. | @@ -813,7 +813,7 @@ Each annotated merge request version is generated by the corresponding annotatio **`Creation`** | Attribute | Fixed Value | Description | -| -------------- | -------------------- | ---------------------------------------------- | +|----------------|----------------------|------------------------------------------------| | id | - | GitLab/GitHub merge request id. | | prov:startTime | - | Time at which the web resource was created. | | prov:endTime | - | Time at which the web resource was created. | @@ -823,7 +823,7 @@ Each annotated merge request version is generated by the corresponding annotatio **`Annotation`** | Attribute | Fixed Value | Description | -| -------------- | ----------- | ---------------------------------------------------------------------- | +|----------------|-------------|------------------------------------------------------------------------| | id | - | Internal id of the datastructure from which the annotation was parsed. | | name | - | Annotation name/class. Parsed from the annotation body. | | body | - | Annotation string. The string from which the type is parsed. 
| @@ -841,7 +841,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Merge Request`** | Attribute | Fixed Value | Description | -| ------------------------------- | -------------------- | ----------------------------------------------------------------- | +|---------------------------------|----------------------|-------------------------------------------------------------------| | id | - | GitLab/GitHub merge request id. | | iid | - | Internal GitLab/GitHub merge request id. | | title | - | Merge request title. | @@ -860,7 +860,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Merge Request Version`** | Attribute | Fixed Value | Description | -| ---------- | ------------------------- | -------------------------------------------- | +|------------|---------------------------|----------------------------------------------| | id | - | Gitlab/Github id of the merge request. | | prov:type | GitlabMergeRequestVersion | Entity type. | | prov:label | - | Human readable representation of the entity. | @@ -868,7 +868,7 @@ All recognized annotation types are listed in the "Annotations" section of this **`Annotated Merge Request Version`** | Attribute | Fixed Value | Description | -| ---------- | ---------------------------- | ------------------------------------------------------------------------ | +|------------|------------------------------|--------------------------------------------------------------------------| | id | - | Gitlab/Github id of the merge request. | | annotation | - | Gitlab/Github id of the annotation that generated the annotated version. | | prov:type | AnnotatedMergeRequestVersion | Entity type. | @@ -882,40 +882,40 @@ The following tables define the attributes attached to these relations. 
**`Creation - [wasAssociatedWith] -> Merge Request Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------------------| | prov:role | IssueAuthor | Function of the merge request author agent in context of the creation activity. | **`Annotation - [wasAssociatedWith] -> Annotator`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ---------------------------------------------------------------------- | +|-----------|-------------|------------------------------------------------------------------------| | prov:role | Annotator | Function of the annotator agent in context of the annotation activity. | **`Merge Request - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------------| | prov:role | Resource | Function of the merge request entity in context of the creation activity. | | prov:time | - | Time at which the merge request entity was generated. | **`Merge Request Version - [wasGeneratedBy] -> Creation`** | Attribute | Fixed Value | Description | -| --------- | -------------------------------- | ------------------------------------------------------------------------- | +|-----------|----------------------------------|---------------------------------------------------------------------------| | prov:role | ResourceVersionAtPointOfCreation | Function of the merge request entity in context of the creation activity. | | prov:time | - | Time at which the merge request version entity was generated. 
| **`Annotated Merge Request Version - [wasGeneratedBy] -> Annotation`** | Attribute | Fixed Value | Description | -| --------- | ------------------------------ | ------------------------------------------------------------------------- | +|-----------|--------------------------------|---------------------------------------------------------------------------| | prov:role | ResourceVersionAfterAnnotation | Function of the merge request entity in context of the creation activity. | | prov:time | - | Time at which the annotated merge request version entity was generated. | **`Annotation - [used] -> Merge Request Version`** | Attribute | Fixed Value | Description | -| --------- | ---------------------------- | ------------------------------------------------------------------------- | +|-----------|------------------------------|---------------------------------------------------------------------------| | prov:role | ResourceVersionToBeAnnotated | Function of the merge request entity in context of the creation activity. | | prov:time | - | Time at which the merge request version entity was generated. | @@ -947,7 +947,7 @@ The commit is generated by the commit creation activity. **`Asset`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------- | +|------------|-------------|----------------------------------------------| | url | - | Asset URL. | | format | - | Asset format. | | prov:type | Asset | Entity type. | @@ -956,7 +956,7 @@ The commit is generated by the commit creation activity. **`Evidence`** | Attribute | Fixed Value | Description | -| ------------ | ----------- | -------------------------------------------- | +|--------------|-------------|----------------------------------------------| | sha | - | Evidence SHA. | | url | - | Evidence URL. | | collected_at | - | Time at which the evidence was generated. | @@ -966,12 +966,12 @@ The commit is generated by the commit creation activity. 
**`Commit`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------- | +|------------|-------------|----------------------------------------------| | sha | - | Commit SHA1 | | title | - | First 50 characters of the commit message. | | message | - | Commit message. | -| deleted | - | Number of lines deleted. | -| inserted | - | Number of lines inserted. | +| deletions | - | Number of lines deleted. | +| insertions | - | Number of lines inserted. | | lines | - | Number of lines changed. | | files | - | Number of files changed. | | prov:type | GitCommit | Entity type. | @@ -980,7 +980,7 @@ The commit is generated by the commit creation activity. **`Tag`** | Attribute | Fixed Value | Description | -| ---------- | --------------- | ------------------------------------------------- | +|------------|-----------------|---------------------------------------------------| | name | - | Tag name. | | sha | - | Commit SHA1 of the commit that pushed the tag. | | message | - | Commit message of the commit that pushed the tag. | @@ -992,7 +992,7 @@ The commit is generated by the commit creation activity. **`Release`** | Attribute | Fixed Value | Description | -| ----------- | -------------------- | -------------------------------------------- | +|-------------|----------------------|----------------------------------------------| | name | - | Release name. | | body | - | Release body. | | tag_name | - | Release tag name. | @@ -1006,7 +1006,7 @@ The commit is generated by the commit creation activity. **`Commit Author`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. 
| | prov:role | Author | Function of the agent in context of the commit activity. | @@ -1016,7 +1016,7 @@ The commit is generated by the commit creation activity. **`Tag Author`** | Attribute | Fixed Value | Description | -| ---------- | ----------- | -------------------------------------------------------------- | +|------------|-------------|----------------------------------------------------------------| | name | - | `git config user.name` Set in the author's git config. | | email | - | `git config user.email` Set in the author's git config. | | prov:role | Author | Function of the agent in context of the tag creation activity. | @@ -1026,7 +1026,7 @@ The commit is generated by the commit creation activity. **`Release Author`** | Attribute | Fixed Value | Description | -| --------------- | ------------- | ------------------------------------------------------------------------------------------------ | +|-----------------|---------------|--------------------------------------------------------------------------------------------------| | name | - | Author name. As set in the authors GitLab profile. Only available if the token has admin rights. | | email | - | Author email. Set in the author's git config. Only available if the token has admin rights. | | gitlab_username | - | GitLab username. As set in the annotators GitLab profile. | @@ -1040,7 +1040,7 @@ The commit is generated by the commit creation activity. **`Commit Creation`** | Attribute | Fixed Value | Description | -| -------------- | -------------- | ---------------------------------------------- | +|----------------|----------------|------------------------------------------------| | id | - | Commit SHA1. | | prov:startTime | - | Time at which the commit was created. | | prov:endTime | - | Time at which the commit was created. | @@ -1050,7 +1050,7 @@ The commit is generated by the commit creation activity. 
**`Tag Creation`** | Attribute | Fixed Value | Description | -| -------------- | ----------- | ---------------------------------------------- | +|----------------|-------------|------------------------------------------------| | id | - | Tag name. | | prov:startTime | - | Time at which the tag was created. | | prov:endTime | - | Time at which the tag was created. | @@ -1060,7 +1060,7 @@ The commit is generated by the commit creation activity. **`Release Creation`** | Attribute | Fixed Value | Description | -| -------------- | --------------- | ---------------------------------------------- | +|----------------|-----------------|------------------------------------------------| | id | - | Tag name. | | prov:startTime | - | Time at which the release was created. | | prov:endTime | - | Time at which the release was released. | @@ -1075,39 +1075,39 @@ The following tables define the attributes attached to these relations. **`Release Creation - [wasAssociatedWith] -> Release Author`** | Attribute | Fixed Value | Description | -| --------- | ------------- | ------------------------------------------------------------------------------- | +|-----------|---------------|---------------------------------------------------------------------------------| | prov:role | ReleaseAuthor | Function of the merge request author agent in context of the creation activity. | **`Tag Creation - [wasAssociatedWith] -> Tag Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------------------| | prov:role | TagAuthor | Function of the merge request author agent in context of the creation activity.
| **`Commit Creation - [wasAssociatedWith] -> Commit Author`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------------------| | prov:role | Author | Function of the merge request author agent in context of the creation activity. | **`Release - [wasGeneratedBy] -> Release Creation`** | Attribute | Fixed Value | Description | -| --------- | ----------- | --------------------------------------------------------------------------- | +|-----------|-------------|-----------------------------------------------------------------------------| | prov:role | Release | Function of the release entity in context of the release creation activity. | | prov:time | - | Time at which the release entity was generated. | **`Tag - [wasGeneratedBy] -> Tag Creation`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------| | prov:role | Tag | Function of the tag entity in context of the tag creation activity. | | prov:time | - | Time at which the tag entity was generated. | **`Commit - [wasGeneratedBy] -> Commit Creation`** | Attribute | Fixed Value | Description | -| --------- | ----------- | ------------------------------------------------------------------------- | +|-----------|-------------|---------------------------------------------------------------------------| | prov:role | Tag | Function of the commit entity in context of the commit creation activity. | | prov:time | - | Time at which the commit entity was generated. 
| @@ -1141,7 +1141,7 @@ Here is a list of annotations that we are currently able to parse, along with a ### List of Annotations | Annotation Type | Description | Parsed API Resource | -| ----------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------- | +|-------------------------------------------------|--------------------------------------------------------------------------------------|---------------------| | `remove_label` | Removed label from a resource. | Label Event | | `change_target_branch` | Change merge request target branch. | System Note | | `status_changed_to_merged` | Change status of merge request to merged. | System Note |