Skip to content

Commit 83cc73a

Browse files
committed
Provide script to import git log as csv
1 parent dcd9c29 commit 83cc73a

File tree

3 files changed

+130
-0
lines changed

3 files changed

+130
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Delete all Git log data in the Graph
2+
3+
MATCH (n:Git)
4+
CALL { WITH n
5+
DETACH DELETE n
6+
} IN TRANSACTIONS OF 1000 ROWS
7+
RETURN count(n) as numberOfDeletedRows
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Import git log CSV data with the following schema: (Git:Author)-[:AUTHORED]->(Git:Commit)-[:CHANGED]->(Git:File)
2+
3+
LOAD CSV WITH HEADERS FROM "file:///gitLog.csv" AS row
4+
CALL { WITH row
5+
MERGE (git_author:Git:Author {name: row.author, email: row.email})
6+
MERGE (git_commit:Git:Commit {
7+
hash: row.hash,
8+
message: row.message,
9+
timestamp: datetime(row.timestamp),
10+
timestamp_unix: toInteger(row.timestamp_unix)
11+
})
12+
MERGE (git_file:Git:File {fileName: row.filename})
13+
MERGE (git_author)-[:AUTHORED]->(git_commit)
14+
MERGE (git_commit)-[:CHANGED]->(git_file)
15+
} IN TRANSACTIONS OF 1000 ROWS
16+
RETURN count(DISTINCT row.author) AS numberOfAuthors
17+
,count(DISTINCT row.filename) AS numberOfFiles
18+
,count(DISTINCT row.hash) AS numberOfCommits

scripts/importGitLog.sh

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env bash
2+
3+
# Uses git log to create a comma separated values (CSV) file containing all commits, their author, email address, date and all the file names that were changed with it. The CSV is then imported into Neo4j.
4+
5+
# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source").
6+
# Note: Import will be skipped without an error if the directory is not a git repository.
7+
# Note: This script needs git to be installed.
8+
9+
# Fail on any error ("-o errexit" = exit on first error, "-o pipefail" = exit on errors within piped commands)
10+
set -o errexit -o pipefail
11+
12+
# Overrideable Defaults
13+
NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise"
14+
NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"}
15+
TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools")
16+
SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source")
17+
18+
# Default and initial values for command line options
19+
repository="${SOURCE_DIRECTORY}"
20+
21+
# Read command line options
22+
USAGE="importGitLog: Usage: $0 [--repository <git repository directory>(default=source)]"
23+
while [[ $# -gt 0 ]]; do
24+
key="$1"
25+
case $key in
26+
--repository)
27+
repository="$2"
28+
# Check if the explicitly given repository is a valid directory
29+
if [ ! -d "${repository}" ] ; then
30+
echo "importGitLog: Error: The given repository <${repository}> is not a directory" >&2
31+
echo "${USAGE}" >&2
32+
exit 1
33+
fi
34+
shift
35+
;;
36+
*)
37+
# Report unknown options on stderr, consistent with the other error and usage messages.
echo "importGitLog: Error: Unknown option: ${key}" >&2
38+
echo "${USAGE}" >&2
39+
exit 1
40+
esac
41+
shift
42+
done
43+
44+
# Check if the repository is actually a git repository
45+
# All output of the probe is discarded on purpose: only the exit status matters.
# This keeps the ".git" path printed by git and any "cd" error off the console,
# so a missing or non-git directory is skipped silently.
if ! (cd "${repository}" && git rev-parse --git-dir) > /dev/null 2>&1; then
46+
echo "importGitLog: Import skipped. ${repository} is not a git repository."
47+
exit 0
48+
fi
49+
50+
echo "importGitLog: repository=${repository}"
51+
52+
## Get this "scripts" directory if not already set
53+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
54+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
55+
# This way non-standard tools like readlink aren't needed.
56+
SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts
57+
echo "importGitLog: SCRIPTS_DIR=${SCRIPTS_DIR}"
58+
59+
# Get the "cypher" directory by taking the path of this script, going one directory up and then into "cypher".
60+
CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"}
61+
echo "importGitLog: CYPHER_DIR=${CYPHER_DIR}"
62+
63+
# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file
64+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
65+
66+
# Internal constants
67+
IMPORTS_CYPHER_DIR="${CYPHER_DIR}/Imports"
68+
NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}"
69+
NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}"
70+
# Resolve the absolute path of the Neo4j import directory.
# "&&" (instead of ";") makes the command substitution fail when the directory
# does not exist, so errexit stops the script instead of silently using the
# current working directory as the CSV output location.
NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import" && pwd)
71+
OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/gitLog.csv"
72+
73+
# ----- Create a CSV file with git log data containing all commits and their changed files
74+
echo "importGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..."
75+
76+
(
77+
# Git log needs to be executed in the directory of the repository.
78+
# This is done in a sub shell to automatically return to the previous directory.
79+
cd "${repository}" || exit
80+
81+
# Prints the header line of the CSV file with the names of the columns.
82+
echo "hash,author,email,timestamp,timestamp_unix,message,filename" > "${OUTPUT_CSV_FILENAME}"
83+
84+
# Prints the git log in CSV format including the changed files.
85+
# Includes quoted strings, double quote escaping and supports commas in strings.
86+
# - --pretty=format starts with a space that is needed to detect the start of a line.
87+
# gsub(/^ /, "", a[1]); removes that space then afterwards
88+
# - 3 commas (,,,) should be very unlikely to appear in names, email addresses and commit messages so they are used as an intermediate separator (see split)
89+
# - gsub(/"/, "\"\"", a[6]) escapes double quotes with two of them (CSV standard)
90+
git log --no-merges --pretty=format:' %h,,,%an,,,%ae,,,%aI,,,%ct,,,%s' --name-only | \
91+
awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); gsub(/"/, "\"\"", a[6]); gsub(/\\/, " ", a[6]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE COMMA a[4] COMMA a[5] COMMA QUOTE a[6] QUOTE } NF && !/^\ / { print commit ",\""$0"\"" }' | \
92+
grep -v -F '[bot]' >> "${OUTPUT_CSV_FILENAME}"
93+
)
94+
95+
csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
96+
echo "importGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes created."
97+
# ---------
98+
99+
# ----- Import git log data csv
100+
echo "importGitLog: Deleting all existing git data in the Graph..."
101+
execute_cypher "${IMPORTS_CYPHER_DIR}/Delete_git_log_data.cypher"
102+
103+
echo "importGitLog: Importing new git log data into the Graph..."
104+
execute_cypher "${IMPORTS_CYPHER_DIR}/Import_git_log_csv_data.cypher"
105+
# ---------

0 commit comments

Comments
 (0)