Skip to content

Commit 5b4ec7c

Browse files
committed
Provide script to import git log as csv
1 parent d8512cd commit 5b4ec7c

File tree

3 files changed

+153
-0
lines changed

3 files changed

+153
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Delete all Git log data in the Graph
2+
3+
MATCH (n:Git)
4+
CALL { WITH n
5+
DETACH DELETE n
6+
} IN TRANSACTIONS OF 1000 ROWS
7+
RETURN count(n) as numberOfDeletedRows
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Import git log CSV data with the following schema: (Git:Author)-[:AUTHORED]->(Git:Commit)-[:CHANGED]->(Git:File)
2+
3+
LOAD CSV WITH HEADERS FROM "file:///gitLog.csv" AS row
4+
CALL { WITH row
5+
MERGE (git_author:Git:Author {name: row.author, email: row.email})
6+
MERGE (git_commit:Git:Commit {
7+
hash: row.hash,
8+
message: row.message,
9+
timestamp: datetime(row.timestamp),
10+
timestamp_unix: toInteger(row.timestamp_unix)
11+
})
12+
MERGE (git_file:Git:File {fileName: row.filename})
13+
MERGE (git_author)-[:AUTHORED]->(git_commit)
14+
MERGE (git_commit)-[:CHANGED]->(git_file)
15+
} IN TRANSACTIONS OF 1000 ROWS
16+
RETURN count(DISTINCT row.author) AS numberOfAuthors
17+
,count(DISTINCT row.filename) AS numberOfFiles
18+
,count(DISTINCT row.hash) AS numberOfCommits

scripts/importGitLog.sh

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#!/usr/bin/env bash
2+
3+
# Uses git log to create a comma separated values (CSV) file containing all commits, their author, email address, date and all the file names that were changed with it. The CSV is then imported into Neo4j.
4+
5+
# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source").
6+
# Note: Import will be skipped without an error if the directory is not a git repository.
7+
# Note: This script needs git to be installed.
8+
9+
# Fail on any error ("-o errexit" = exit on first error, "-o pipefail" = exit on errors within piped commands)
10+
set -o errexit -o pipefail
11+
12+
# Overrideable Defaults
13+
NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise"
14+
NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"}
15+
TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools")
16+
SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source")
17+
18+
# Default and initial values for command line options
19+
repository="${SOURCE_DIRECTORY}"
20+
21+
# Read command line options
22+
USAGE="importGitLog: Usage: $0 [--repository <git repository directory>(default=source)]"
23+
while [[ $# -gt 0 ]]; do
24+
key="$1"
25+
case $key in
26+
--repository)
27+
repository="$2"
28+
# Check if the explicitly given repository is a valid directory
29+
if [ ! -d "${repository}" ] ; then
30+
echo "importGitLog: Error: The given repository <${repository}> is not a directory" >&2
31+
echo "${USAGE}" >&2
32+
exit 1
33+
fi
34+
shift
35+
;;
36+
*)
37+
# Report the unknown option on stderr, consistent with the other error and usage messages.
echo "importGitLog: Error: Unknown option: ${key}" >&2
38+
echo "${USAGE}" >&2
39+
exit 1
40+
esac
41+
shift
42+
done
43+
44+
# Check if the repository is actually a git repository
45+
if ! (cd "${repository}" || exit; git rev-parse --git-dir 2> /dev/null || exit); then
46+
echo "importGitLog: Import skipped. ${repository} is not a git repository."
47+
exit 0
48+
fi
49+
50+
echo "importGitLog: repository=${repository}"
51+
52+
## Get this "scripts" directory if not already set
53+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
54+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
55+
# This way non-standard tools like readlink aren't needed.
56+
SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts
57+
# Log the resolved scripts directory using this script's own log prefix.
echo "importGitLog: SCRIPTS_DIR=${SCRIPTS_DIR}"
58+
59+
# Get the "cypher" directory by taking the path of this script and going one directory up and then to "cypher".
60+
CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"}
61+
echo "importGitLog: CYPHER_DIR=${CYPHER_DIR}"
62+
63+
# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file
64+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
65+
66+
# Internal constants
67+
IMPORTS_CYPHER_DIR="${CYPHER_DIR}/Imports"
68+
NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}"
69+
NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}"
70+
# Resolve the absolute path of the Neo4j import directory.
# "&&" (instead of ";") makes the command substitution fail when the directory doesn't exist,
# so that the CSV file is never written into the current working directory by accident.
NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import" && pwd)
71+
OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/gitLog.csv"
72+
73+
# ----- Create a CSV file with git log data containing all commits and their changed files
74+
echo "importGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..."
75+
76+
(
77+
# Git log needs to be executed in the directory of the repository.
78+
# This is done in a sub shell to automatically return to the previous directory.
79+
cd "${repository}" || exit
80+
81+
# Prints the header line of the CSV file with the names of the columns.
82+
echo "hash,author,email,timestamp,timestamp_unix,message,filename" > "${OUTPUT_CSV_FILENAME}"
83+
84+
# Prints the git log in CSV format including the changed files.
85+
# Includes quoted strings, double quote escaping and supports commas in strings.
86+
git log --no-merges --pretty=format:' %h,,,%an,,,%ae,,,%aI,,,%ct,,,%s' --name-only | \
87+
awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); gsub(/"/, "\"\"", a[6]); gsub(/\\/, " ", a[6]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE COMMA a[4] COMMA a[5] COMMA QUOTE a[6] QUOTE } NF && !/^\ / { print commit ",\""$0"\"" }' | \
88+
grep -v -F '[bot]' >> "${OUTPUT_CSV_FILENAME}"
89+
# Explanation:
90+
#
91+
# - --no-merges: Excludes merge commits from the log.
92+
# - %h: Abbreviated commit hash
93+
# - %an: Author name
94+
# - %ae: Author email
95+
# - %aI: Author date, ISO 8601 format
96+
# - %ct: Commit date, Unix timestamp
97+
# - %s: Subject of the commit
98+
# - --name-only: Lists the files affected by each commit.
99+
# - --pretty=format starts with a space that is needed to detect the start of a line.
100+
# - The chosen delimiters ,,, are used to separate these fields to make parsing easier.
101+
# It is very unlikely that they appear in the contents and will be used as an intermediate step before escaping.
102+
#
103+
# - BEGIN { COMMA=","; QUOTE="\"" }: Initializes the variables COMMA and QUOTE to hold a comma and a double-quote character respectively.
104+
# - /^ / { ... }: Processes lines that start with a space (indicating commit information, since the --pretty=format starts with a space).
105+
# - gsub(/^ /, "", a[1]): Removes leading spaces from the first field (commit hash) that was used to indicate a new commit.
106+
# - gsub(/"/, "\"\"", a[6]) escapes double quotes with two double quotes (CSV standard).
107+
# a[6] is the commit message column. Double quote escaping is done for every string column
108+
# - gsub(/\\/, " ", a[6]): Replaces backslashes in the commit message with spaces.
109+
#   Otherwise, \" would lead to an error since it would be seen as a non-escaped double quote.
110+
# - commit=...: Constructs the commit information in CSV format, including the quoted author name, author email, and commit message except for the file name.
111+
# - NF && !/^\ / { print commit ",\""$0"\"" }: For non-empty lines that do not start with a space (indicating a file name in git log --name-only output),
112+
# it prints the commit information followed by the file name(s), enclosed in quotes.
113+
#
114+
# - grep -v -F '[bot]': Filters out all lines that contain the literal text [bot] anywhere (e.g. in the author name or the commit message).
115+
# Used to identify commits made by automated systems or bots.
116+
)
117+
118+
csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
119+
echo "importGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes created."
120+
# ---------
121+
122+
# ----- Import git log data csv
123+
echo "importGitLog: Deleting all existing git data in the Graph..."
124+
execute_cypher "${IMPORTS_CYPHER_DIR}/Delete_git_log_data.cypher"
125+
126+
echo "importGitLog: Importing new git log data into the Graph..."
127+
execute_cypher "${IMPORTS_CYPHER_DIR}/Import_git_log_csv_data.cypher"
128+
# ---------

0 commit comments

Comments
 (0)