Skip to content

Commit 339011c

Browse files
committed
Provide script to import git log as csv
1 parent d8512cd commit 339011c

File tree

5 files changed

+304
-0
lines changed

5 files changed

+304
-0
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Delete all Git log data in the Graph
2+
3+
MATCH (n:Git)
4+
CALL { WITH n
5+
DETACH DELETE n
6+
} IN TRANSACTIONS OF 1000 ROWS
7+
RETURN count(n) as numberOfDeletedRows
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Import aggregated git log CSV data with the following schema: (Git:Author)-[:AUTHORED]->(Git:ChangeSpan)-[:CONTAINS]->(Git:File)
2+
3+
LOAD CSV WITH HEADERS FROM "file:///aggregatedGitLog.csv" AS row
4+
CALL { WITH row
5+
MERGE (git_author:Git:Author {name: row.author, email: row.email})
6+
MERGE (git_change_span:Git:ChangeSpan {
7+
year: toInteger(row.year),
8+
month: toInteger(row.month),
9+
commits: toInteger(row.commits)
10+
})
11+
MERGE (git_file:Git:File {fileName: row.filename})
12+
MERGE (git_author)-[:AUTHORED]->(git_change_span)
13+
MERGE (git_change_span)-[:CONTAINS]->(git_file)
14+
} IN TRANSACTIONS OF 1000 ROWS
15+
RETURN count(DISTINCT row.author) AS numberOfAuthors
16+
,count(DISTINCT row.filename) AS numberOfFiles
17+
,sum(toInteger(row.commits)) AS numberOfCommits
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Import git log CSV data with the following schema: (Git:Author)-[:AUTHORED]->(Git:Commit)-[:CONTAINS]->(Git:File)
2+
3+
LOAD CSV WITH HEADERS FROM "file:///gitLog.csv" AS row
4+
CALL { WITH row
5+
MERGE (git_author:Git:Author {name: row.author, email: row.email})
6+
MERGE (git_commit:Git:Commit {
7+
hash: row.hash,
8+
message: row.message,
9+
timestamp: datetime(row.timestamp),
10+
timestamp_unix: toInteger(row.timestamp_unix)
11+
})
12+
MERGE (git_file:Git:File {fileName: row.filename})
13+
MERGE (git_author)-[:AUTHORED]->(git_commit)
14+
MERGE (git_commit)-[:CONTAINS]->(git_file)
15+
} IN TRANSACTIONS OF 1000 ROWS
16+
RETURN count(DISTINCT row.author) AS numberOfAuthors
17+
,count(DISTINCT row.filename) AS numberOfFiles
18+
,count(DISTINCT row.hash) AS numberOfCommits

scripts/importAggregatedGitLog.sh

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
#!/usr/bin/env bash
2+
3+
# Uses git log to create a comma separated values (CSV) file containing aggregated changes, their author name and email address, year and month for all the files that were changed. The CSV is then imported into Neo4j.
4+
5+
# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source").
6+
# Note: Import will be skipped without an error if the directory is not a git repository.
7+
# Note: This script needs git to be installed.
8+
9+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
10+
set -o errexit -o pipefail
11+
12+
# Overrideable Defaults
13+
NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise"
14+
NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"}
15+
TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools")
16+
SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source")
17+
18+
# Default and initial values for command line options
19+
repository="${SOURCE_DIRECTORY}"
20+
21+
# Read command line options
22+
USAGE="importAggregatedGitLog: Usage: $0 [--repository <git repository directory>(default=source)]"
23+
while [[ $# -gt 0 ]]; do
24+
key="$1"
25+
case $key in
26+
--repository)
27+
repository="$2"
28+
# Check if the explicitly given repository is a valid directory
29+
if [ ! -d "${repository}" ] ; then
30+
echo "importAggregatedGitLog: Error: The given repository <${repository}> is not a directory" >&2
31+
echo "${USAGE}" >&2
32+
exit 1
33+
fi
34+
shift
35+
;;
36+
*)
37+
# Report unknown options on stderr, consistent with the other error messages of this script.
echo "importAggregatedGitLog: Error: Unknown option: ${key}" >&2
38+
echo "${USAGE}" >&2
39+
exit 1
40+
esac
41+
shift
42+
done
43+
44+
# Check if the repository is actually a git repository
45+
if ! (cd "${repository}" || exit; git rev-parse --git-dir 2> /dev/null || exit); then
46+
echo "importAggregatedGitLog: Import skipped. ${repository} is not a git repository."
47+
exit 0
48+
fi
49+
50+
echo "importAggregatedGitLog: repository=${repository}"
51+
52+
## Get this "scripts" directory if not already set
53+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
54+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
55+
# This way non-standard tools like readlink aren't needed.
56+
SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts
57+
echo "importAggregatedGitLog: SCRIPTS_DIR=$SCRIPTS_DIR"
58+
59+
# Get the "cypher" directory by taking the path of this script and going one directory up and then to "cypher".
60+
CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"}
61+
echo "importAggregatedGitLog: CYPHER_DIR=${CYPHER_DIR}"
62+
63+
# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file
64+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
65+
66+
# Internal constants
67+
IMPORTS_CYPHER_DIR="${CYPHER_DIR}/Imports"
68+
NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}"
69+
NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}"
70+
# Resolve the absolute path of the Neo4j import directory. "&&" lets a failing "cd" fail the assignment (and with errexit the script)
# instead of silently falling back to the current working directory returned by "pwd".
NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import" && pwd)
71+
OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/aggregatedGitLog.csv"
72+
73+
# ----- Create a CSV file with aggregated git log data containing the number of commits per changed file, year, month and author
74+
echo "importAggregatedGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..."
75+
76+
(
77+
# Git log needs to be executed in the directory of the repository.
78+
# This is done in a sub shell to automatically return to the previous directory.
79+
cd "${repository}" || exit
80+
81+
# Prints the header line of the CSV file with the names of the columns.
82+
echo "filename,year,month,author,email,commits" > "${OUTPUT_CSV_FILENAME}"
83+
84+
# Prints the aggregated git log in CSV format starting with the changed file, year-month, author, author email and number of commits.
85+
# Includes quoted strings, double quote escaping and supports commas in strings.
86+
git log --no-merges --pretty=format:' %ad,,,%an,,,%ae' --date=format:'%Y,%m' --name-only | \
87+
awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE } NF && !/^\ / { print "\""$0"\"," commit }' |
88+
grep -v -F '[bot]' | \
89+
sort | uniq -c | \
90+
sed -E 's/^ *([0-9]+) (.+)/\2,\1/g' \
91+
>> "${OUTPUT_CSV_FILENAME}"
92+
# Explanation:
93+
#
94+
# - --no-merges: Excludes merge commits from the log.
95+
# - %ad: Author date (formatted as specified later)
96+
# - %an: Author name
97+
# - %ae: Author email
98+
# - %ct: Commit date, Unix timestamp
99+
# - %s: Subject of the commit
100+
# - --date=format:'%Y,%m': Takes the year and the month of the date separated by a comma for example 2024,06
101+
# - --name-only: Lists the files affected by each commit.
102+
# - --pretty=format starts with a space that is needed to detect the start of a line.
103+
# - The chosen delimiters ,,, are used to separate these fields to make parsing easier.
104+
# It is very unlikely that they appear in the contents and will be used as an intermediate step before escaping.
105+
#
106+
# - BEGIN { COMMA=","; QUOTE="\"" }: Initializes the variables COMMA and QUOTE to hold a comma and a double-quote character respectively.
107+
# - /^ / { ... }: Processes lines that start with a space (indicating the commit information line produced by --pretty=format).
108+
# - gsub(/^ /, "", a[1]): Removes the leading space from the first field (author date as year,month) that was used to indicate a new commit.
109+
# - gsub(/"/, "\"\"", a[2]) escapes double quotes with two double quotes (CSV standard).
110+
# a[2] is the commit author. Double quote escaping is done for every string column
111+
# - commit=...: Constructs the commit information in CSV format, including the year-month of the change, quoted author name, and email.
112+
# - NF && !/^\ / { print "\""$0"\"," commit }: For non-empty lines that do not start with a space (indicating a changed file name),
113+
#    it prints the file name, enclosed in quotes, followed by the commit information.
114+
#
115+
# - grep -v -F '[bot]': Filters out all lines that contain the literal text [bot] (typically in the author name or email)
116+
# Used to identify commits made by automated systems or bots.
117+
#
118+
# - sort | uniq -c: Sorts the lines by their content (order of columns essential for that), removes duplicate lines and adds the number of duplicates at the beginning of each line
119+
# - sed -E 's/^ *([0-9]+) (.+)/\2,\1/g': Reformats each line so that the commit count becomes the last column, delimited by a comma.
120+
)
121+
122+
csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
123+
csv_lines=$(wc -l "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
124+
echo "importAggregatedGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes and ${csv_lines} lines created."
125+
# ---------
126+
127+
# ----- Import git log data csv
128+
echo "importAggregatedGitLog: Deleting all existing git data in the Graph..."
129+
execute_cypher "${IMPORTS_CYPHER_DIR}/Delete_git_log_data.cypher"
130+
131+
echo "importAggregatedGitLog: Importing aggregated git log data into the Graph..."
132+
time execute_cypher "${IMPORTS_CYPHER_DIR}/Import_aggregated_git_log_csv_data.cypher"
133+
# ---------

scripts/importGitLog.sh

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/usr/bin/env bash
2+
3+
# Uses git log to create a comma separated values (CSV) file containing all commits, their author, email address, date and all the file names that were changed with it. The CSV is then imported into Neo4j.
4+
5+
# Note: This script needs the path to a git repository directory. It defaults to SOURCE_DIRECTORY ("source").
6+
# Note: Import will be skipped without an error if the directory is not a git repository.
7+
# Note: This script needs git to be installed.
8+
9+
# Fail on any error ("-e" = exit on first error, "-o pipefail" exits on errors within piped commands)
10+
set -o errexit -o pipefail
11+
12+
# Overrideable Defaults
13+
NEO4J_EDITION=${NEO4J_EDITION:-"community"} # Choose "community" or "enterprise"
14+
NEO4J_VERSION=${NEO4J_VERSION:-"5.16.0"}
15+
TOOLS_DIRECTORY=${TOOLS_DIRECTORY:-"tools"} # Get the tools directory (defaults to "tools")
16+
SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-"source"} # Get the source repository directory (defaults to "source")
17+
18+
# Default and initial values for command line options
19+
repository="${SOURCE_DIRECTORY}"
20+
21+
# Read command line options
22+
USAGE="importGitLog: Usage: $0 [--repository <git repository directory>(default=source)]"
23+
while [[ $# -gt 0 ]]; do
24+
key="$1"
25+
case $key in
26+
--repository)
27+
repository="$2"
28+
# Check if the explicitly given repository is a valid directory
29+
if [ ! -d "${repository}" ] ; then
30+
echo "importGitLog: Error: The given repository <${repository}> is not a directory" >&2
31+
echo "${USAGE}" >&2
32+
exit 1
33+
fi
34+
shift
35+
;;
36+
*)
37+
# Report unknown options on stderr, consistent with the other error messages of this script.
echo "importGitLog: Error: Unknown option: ${key}" >&2
38+
echo "${USAGE}" >&2
39+
exit 1
40+
esac
41+
shift
42+
done
43+
44+
# Check if the repository is actually a git repository
45+
if ! (cd "${repository}" || exit; git rev-parse --git-dir 2> /dev/null || exit); then
46+
echo "importGitLog: Import skipped. ${repository} is not a git repository."
47+
exit 0
48+
fi
49+
50+
echo "importGitLog: repository=${repository}"
51+
52+
## Get this "scripts" directory if not already set
53+
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
54+
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
55+
# This way non-standard tools like readlink aren't needed.
56+
SCRIPTS_DIR=${SCRIPTS_DIR:-$( CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P )} # Repository directory containing the shell scripts
57+
echo "importGitLog: SCRIPTS_DIR=$SCRIPTS_DIR"
58+
59+
# Get the "cypher" directory by taking the path of this script and going one directory up and then to "cypher".
60+
CYPHER_DIR=${CYPHER_DIR:-"${SCRIPTS_DIR}/../cypher"}
61+
echo "importGitLog: CYPHER_DIR=${CYPHER_DIR}"
62+
63+
# Define functions (like execute_cypher and execute_cypher_summarized) to execute cypher queries from within a given file
64+
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
65+
66+
# Internal constants
67+
IMPORTS_CYPHER_DIR="${CYPHER_DIR}/Imports"
68+
NEO4J_INSTALLATION_NAME="neo4j-${NEO4J_EDITION}-${NEO4J_VERSION}"
69+
NEO4J_INSTALLATION_DIRECTORY="${TOOLS_DIRECTORY}/${NEO4J_INSTALLATION_NAME}"
70+
# Resolve the absolute path of the Neo4j import directory. "&&" lets a failing "cd" fail the assignment (and with errexit the script)
# instead of silently falling back to the current working directory returned by "pwd".
NEO4J_FULL_IMPORT_DIRECTORY=$(cd "${NEO4J_INSTALLATION_DIRECTORY}/import" && pwd)
71+
OUTPUT_CSV_FILENAME="${NEO4J_FULL_IMPORT_DIRECTORY}/gitLog.csv"
72+
73+
# ----- Create a CSV file with git log data containing all commits and their changed files
74+
echo "importGitLog: Creating ${OUTPUT_CSV_FILENAME} from git log..."
75+
76+
(
77+
# Git log needs to be executed in the directory of the repository.
78+
# This is done in a sub shell to automatically return to the previous directory.
79+
cd "${repository}" || exit
80+
81+
# Prints the header line of the CSV file with the names of the columns.
82+
echo "hash,author,email,timestamp,timestamp_unix,message,filename" > "${OUTPUT_CSV_FILENAME}"
83+
84+
# Prints the git log in CSV format including the changed files.
85+
# Includes quoted strings, double quote escaping and supports commas in strings.
86+
git log --no-merges --pretty=format:' %h,,,%an,,,%ae,,,%aI,,,%ct,,,%s' --name-only | \
87+
awk 'BEGIN { COMMA=",";QUOTE="\"" } /^ / { split($0, a, ",,,"); gsub(/^ /, "", a[1]); gsub(/"/, "\"\"", a[2]); gsub(/"/, "\"\"", a[3]); gsub(/"/, "\"\"", a[6]); gsub(/\\/, " ", a[6]); commit=a[1] COMMA QUOTE a[2] QUOTE COMMA QUOTE a[3] QUOTE COMMA a[4] COMMA a[5] COMMA QUOTE a[6] QUOTE } NF && !/^\ / { print commit ",\""$0"\"" }' | \
88+
grep -v -F '[bot]' >> "${OUTPUT_CSV_FILENAME}"
89+
# Explanation:
90+
#
91+
# - --no-merges: Excludes merge commits from the log.
92+
# - %h: Abbreviated commit hash
93+
# - %an: Author name
94+
# - %ae: Author email
95+
# - %aI: Author date, ISO 8601 format
96+
# - %ct: Commit date, Unix timestamp
97+
# - %s: Subject of the commit
98+
# - --name-only: Lists the files affected by each commit.
99+
# - --pretty=format starts with a space that is needed to detect the start of a line.
100+
# - The chosen delimiters ,,, are used to separate these fields to make parsing easier.
101+
# It is very unlikely that they appear in the contents and will be used as an intermediate step before escaping.
102+
#
103+
# - BEGIN { COMMA=","; QUOTE="\"" }: Initializes the variables COMMA and QUOTE to hold a comma and a double-quote character respectively.
104+
# - /^ / { ... }: Processes lines that start with a space (indicating the commit information line produced by --pretty=format).
105+
# - gsub(/^ /, "", a[1]): Removes leading spaces from the first field (commit hash) that was used to indicate a new commit.
106+
# - gsub(/"/, "\"\"", a[6]) escapes double quotes with two double quotes (CSV standard).
107+
# a[6] is the commit message column. Double quote escaping is done for every string column
108+
# - gsub(/\\/, " ", a[6]): Replaces backslashes in the commit message with spaces.
109+
# Otherwise, \" would lead to an error since it would be seen as an non escaped double quote.
110+
# - commit=...: Constructs the commit information in CSV format, including the quoted author name, author email, and commit message except for the file name.
111+
# - NF && !/^\ / { print commit ",\""$0"\"" }: For non-empty lines that do not start with a space (indicating a changed file name),
112+
# it prints the commit information followed by the file name(s), enclosed in quotes.
113+
#
114+
# - grep -v -F '[bot]': Filters out all lines that contain the literal text [bot] (for example in the author name, email or commit message)
115+
# Used to identify commits made by automated systems or bots.
116+
)
117+
118+
csv_file_size=$(wc -c "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
119+
csv_lines=$(wc -l "${OUTPUT_CSV_FILENAME}" | awk '{print $1}')
120+
echo "importGitLog: File ${OUTPUT_CSV_FILENAME} with ${csv_file_size} bytes and ${csv_lines} lines created."
121+
# ---------
122+
123+
# ----- Import git log data csv
124+
echo "importGitLog: Deleting all existing git data in the Graph..."
125+
execute_cypher "${IMPORTS_CYPHER_DIR}/Delete_git_log_data.cypher"
126+
127+
echo "importGitLog: Importing new git log data into the Graph..."
128+
execute_cypher "${IMPORTS_CYPHER_DIR}/Import_git_log_csv_data.cypher"
129+
# ---------

0 commit comments

Comments
 (0)