diff --git a/cypher/Community_Detection/Community_Detection_7d_Modularity_Members.cypher b/cypher/Community_Detection/Community_Detection_7d_Modularity_Members.cypher index e58b29776..52aae8843 100644 --- a/cypher/Community_Detection/Community_Detection_7d_Modularity_Members.cypher +++ b/cypher/Community_Detection/Community_Detection_7d_Modularity_Members.cypher @@ -25,6 +25,6 @@ CALL gds.modularity.stream( RETURN communityId ,memberModularity ,memberCount - ,shortMemberNames - ,memberNames + ,shortMemberNames[0..9] AS someMemberNamesShort + ,memberNames[0..9] AS someMemberNames ORDER BY communityId ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Community_Detection_8d_Conductance.cypher b/cypher/Community_Detection/Community_Detection_8d_Conductance.cypher new file mode 100644 index 000000000..f238ee563 --- /dev/null +++ b/cypher/Community_Detection/Community_Detection_8d_Conductance.cypher @@ -0,0 +1,10 @@ +// Community Detection Conductance + +CALL gds.conductance.stream( + $dependencies_projection + '-cleaned', { + relationshipWeightProperty: $dependencies_projection_weight_property + ,communityProperty: $dependencies_projection_write_property +}) + YIELD community, conductance +RETURN community, conductance +ORDER BY community ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Community_Detection_8d_Conductance_Members.cypher b/cypher/Community_Detection/Community_Detection_8d_Conductance_Members.cypher new file mode 100644 index 000000000..0fd8a832f --- /dev/null +++ b/cypher/Community_Detection/Community_Detection_8d_Conductance_Members.cypher @@ -0,0 +1,30 @@ +// Community Detection Conductance Members + +CALL gds.conductance.stream( + $dependencies_projection + '-cleaned', { + relationshipWeightProperty: $dependencies_projection_weight_property + ,communityProperty: $dependencies_projection_write_property +}) + YIELD community AS communityId, conductance + WITH collect({communityId: communityId, 
conductance: conductance}) AS communityMetrics + MATCH (member) + WHERE member[$dependencies_projection_write_property] IS NOT NULL + AND $dependencies_projection_node IN LABELS(member) + WITH communityMetrics + ,member[$dependencies_projection_write_property] AS communityId + ,coalesce(member.fqn, member.fileName, member.name) AS memberName + ,coalesce(member.name, replace(last(split(member.fileName, '/')), '.jar', '')) AS shortMemberName + WITH communityMetrics + ,communityId + ,count(DISTINCT memberName) AS memberCount + ,collect(DISTINCT shortMemberName) AS shortMemberNames + ,collect(DISTINCT memberName) AS memberNames + ,reduce(memberConductance = 0, conductance IN communityMetrics | + CASE conductance.communityId WHEN communityId THEN conductance.conductance + ELSE memberConductance END) AS conductance + RETURN communityId + ,conductance + ,memberCount + ,shortMemberNames[0..9] AS someMemberNamesShort + ,memberNames[0..9] AS someMemberNames +ORDER BY communityId ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Community_Detection_9_Community_Metrics.cypher b/cypher/Community_Detection/Community_Detection_9_Community_Metrics.cypher new file mode 100644 index 000000000..5b52856bc --- /dev/null +++ b/cypher/Community_Detection/Community_Detection_9_Community_Metrics.cypher @@ -0,0 +1,44 @@ +// Community Metrics + + CALL gds.conductance.stream( + $dependencies_projection + '-cleaned', { + relationshipWeightProperty: $dependencies_projection_weight_property + ,communityProperty: $dependencies_projection_write_property +}) + YIELD community AS communityId, conductance + WITH collect({communityId: communityId, conductance: conductance}) AS conductances + CALL gds.modularity.stream( + $dependencies_projection + '-cleaned', { + relationshipWeightProperty: $dependencies_projection_weight_property + ,communityProperty: $dependencies_projection_write_property +}) + YIELD communityId, modularity + WITH conductances + ,collect({communityId: 
communityId, modularity: modularity}) AS modularities + MATCH (member) + WHERE member[$dependencies_projection_write_property] IS NOT NULL + AND $dependencies_projection_node IN LABELS(member) + WITH conductances + ,modularities + ,member[$dependencies_projection_write_property] AS communityId + ,coalesce(member.fqn, member.fileName, member.name) AS memberName + ,coalesce(member.name, replace(last(split(member.fileName, '/')), '.jar', '')) AS shortMemberName + WITH conductances + ,modularities + ,communityId + ,count(DISTINCT memberName) AS memberCount + ,collect(DISTINCT shortMemberName) AS shortMemberNames + ,collect(DISTINCT memberName) AS memberNames + ,reduce(memberConductance = 0, conductance IN conductances | + CASE conductance.communityId WHEN communityId THEN conductance.conductance + ELSE memberConductance END) AS conductance + ,reduce(memberModularity = 0, modularity IN modularities | + CASE modularity.communityId WHEN communityId THEN modularity.modularity + ELSE memberModularity END) AS modularity + RETURN communityId + ,conductance + ,modularity + ,memberCount + ,shortMemberNames[0..9] AS someMemberNamesShort + ,memberNames[0..9] AS someMemberNames +ORDER BY communityId ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Type_communities_that_span_the_most_packages.cypher b/cypher/Community_Detection/Type_communities_that_span_the_most_packages.cypher new file mode 100644 index 000000000..1528cc309 --- /dev/null +++ b/cypher/Community_Detection/Type_communities_that_span_the_most_packages.cypher @@ -0,0 +1,19 @@ +// Communities that span the most packages + + MATCH (a:Artifact)-[:CONTAINS]->(p:Package)-[:CONTAINS]->(t:Type) + WITH replace(last(split(a.fileName, '/')), '.jar', '') AS artifactName + ,t.communityLeidenId AS communityId + ,collect(DISTINCT p.fqn) AS packageNames + ,count(DISTINCT p.fqn) AS packageCount + ,collect(DISTINCT t.fqn) AS typeNames + ,count(DISTINCT t.fqn) AS typeCount + WHERE communityId IS NOT NULL 
+RETURN artifactName + ,communityId + ,packageCount + ,typeCount + ,packageNames + ,typeNames +ORDER BY packageCount DESCENDING + ,typeCount DESCENDING + ,communityId ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Type_communities_that_span_the_most_packages_with_type_statistics.cypher b/cypher/Community_Detection/Type_communities_that_span_the_most_packages_with_type_statistics.cypher new file mode 100644 index 000000000..28815cdcc --- /dev/null +++ b/cypher/Community_Detection/Type_communities_that_span_the_most_packages_with_type_statistics.cypher @@ -0,0 +1,39 @@ +// Communities that span the most packages with type statistics + + MATCH (a:Artifact)-[:CONTAINS]->(p:Package)-[:CONTAINS]->(t:Type) + WITH replace(last(split(a.fileName, '/')), '.jar', '') AS artifactName + ,t.communityLeidenId AS communityId + ,p.fqn AS packageName + ,collect(DISTINCT p.fqn) AS packageNames + ,count(DISTINCT p.fqn) AS packageCount + ,collect(DISTINCT t.fqn) AS typeNames + ,count(DISTINCT t.fqn) AS typeCount +ORDER BY typeCount ASCENDING + WHERE communityId IS NOT NULL + WITH artifactName + ,communityId + ,collect(DISTINCT packageName) AS packageNames + ,count(DISTINCT packageName) AS packageCount + // The object structure of "packageCommunityTypes" only works in the browser. + // It is only meant to be a helper to see how the communities and their packages are distributed in detail. 
+ //,collect(DISTINCT {package: packageName, numberOfTypes:typeCount}) AS packageCommunityTypes + ,sum(typeCount) AS sumTypeCount + ,min(typeCount) AS minTypeCount + ,max(typeCount) AS maxTypeCount + ,avg(typeCount) AS avgTypeCount + ,stDev(typeCount) AS stdTypeCount + ,percentileDisc(typeCount, 0.5) AS per5TypeCount +RETURN artifactName + ,communityId + ,packageCount + ,sumTypeCount + ,minTypeCount + ,maxTypeCount + ,avgTypeCount + ,stdTypeCount + ,per5TypeCount + //,packageCommunityTypes + ,packageNames +ORDER BY packageCount DESCENDING + ,sumTypeCount DESCENDING + ,communityId ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher b/cypher/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher new file mode 100644 index 000000000..b17e06c1a --- /dev/null +++ b/cypher/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher @@ -0,0 +1,34 @@ +// Type communities with few members in foreign packages + + MATCH (t:Type) + WITH t.communityLeidenId AS communityId + ,count(DISTINCT t.fqn) AS numberOfTypesInCommunity + WHERE communityId IS NOT NULL + MATCH (a:Artifact)-[:CONTAINS]->(p:Package)-[:CONTAINS]->(communityType:Type) + MATCH (p)-[:CONTAINS]->(packageType:Type) + WHERE communityType.communityLeidenId = communityId + AND packageType.communityLeidenId IS NOT NULL + WITH replace(last(split(a.fileName, '/')), '.jar', '') AS artifactName + ,p.fqn AS packageName + ,numberOfTypesInCommunity + ,count(DISTINCT packageType.fqn) AS numberOfTypesInPackage + ,collect(communityType) AS packageTypes +UNWIND packageTypes AS packageType + WITH artifactName + ,packageName + ,packageType.communityLeidenId AS communityId + ,numberOfTypesInPackage + ,numberOfTypesInCommunity + ,count(DISTINCT packageType.fqn) AS numberOfTypes + WHERE numberOfTypes < numberOfTypesInCommunity + AND numberOfTypes < numberOfTypesInPackage +RETURN artifactName + 
,packageName + ,communityId + ,numberOfTypesInPackage + ,numberOfTypesInCommunity + ,numberOfTypes +ORDER BY numberOfTypes ASCENDING + ,numberOfTypesInCommunity DESCENDING + ,numberOfTypesInPackage DESCENDING + ,packageName ASCENDING \ No newline at end of file diff --git a/cypher/Community_Detection_Label_Propagation.cypher b/cypher/Community_Detection_Label_Propagation.cypher deleted file mode 100644 index cc5acc957..000000000 --- a/cypher/Community_Detection_Label_Propagation.cypher +++ /dev/null @@ -1,9 +0,0 @@ -//Community Detection Label Propagation -CALL gds.labelPropagation.stream('package-dependencies', { - relationshipWeightProperty: 'weight' - ,maxIterations: 10 -}) -YIELD nodeId, communityId - WITH communityId, gds.util.asNode(nodeId) AS package -RETURN communityId, count(package) AS members, collect(package.fqn) AS packageNames -ORDER BY communityId \ No newline at end of file diff --git a/cypher/Community_Detection_Weakly_Connected_Components.cypher b/cypher/Community_Detection_Weakly_Connected_Components.cypher deleted file mode 100644 index 71c100a5b..000000000 --- a/cypher/Community_Detection_Weakly_Connected_Components.cypher +++ /dev/null @@ -1,9 +0,0 @@ -//Community Detection Weakly Connected Components -CALL gds.wcc.stream('package-dependencies', { - relationshipWeightProperty: 'weight' - ,threshold: 50.0 -}) -YIELD nodeId, componentId - WITH componentId, gds.util.asNode(nodeId) AS package -RETURN componentId, count(package) AS members, collect(package.fqn) AS packageNames -ORDER BY componentId \ No newline at end of file diff --git a/scripts/executeQuery.sh b/scripts/executeQuery.sh index a19ac3890..e9899c60e 100755 --- a/scripts/executeQuery.sh +++ b/scripts/executeQuery.sh @@ -105,6 +105,7 @@ then redColor='\033[0;31m' noColor='\033[0m' echo -e "${redColor}${cypher_query_file_name}: ${cypher_query_result}${noColor}" >&2 + echo -e "${redColor}Parameters: ${query_parameters}${noColor}" >&2 exit 1 fi #echo "executeQuery: Cypher Query OK Result: 
${cypher_query_result}" @@ -115,6 +116,7 @@ if [[ -n "${error_message}" ]]; then redColor='\033[0;31m' noColor='\033[0m' echo -e "${redColor}${cypher_query_file_name}: ${error_message}${noColor}" >&2 + echo -e "${redColor}Parameters: ${query_parameters}${noColor}" >&2 exit 1 fi diff --git a/scripts/reports/CommunityCsv.sh b/scripts/reports/CommunityCsv.sh index b62acfb05..5f029ed4c 100755 --- a/scripts/reports/CommunityCsv.sh +++ b/scripts/reports/CommunityCsv.sh @@ -113,6 +113,8 @@ detectCommunitiesWithLouvain() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyNameIntermediate}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + calculateCommunityMetrics "${@}" "${writePropertyName}" } # Community Detection using the Leiden Algorithm @@ -157,19 +159,8 @@ detectCommunitiesWithLeiden() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyNameIntermediate}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" -} -# Write modularity for Leiden communities -# -# Required Parameters: -# - dependencies_projection=... -# Name prefix for the in-memory projection name for dependencies. Example: "package" -# - dependencies_projection_weight_property=... -# Name of the node property that contains the dependency weight. 
Example: "weight" -writeLeidenModularity() { - local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection" - local writePropertyName="dependencies_projection_write_property=communityLeidenId" - execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_7e_Write_Modularity.cypher" "${@}" "${writePropertyName}" + calculateCommunityMetrics "${@}" "${writePropertyName}" } # Community Detection using the Weakly Connected Components Algorithm @@ -205,6 +196,8 @@ detectCommunitiesWithWeaklyConnectedComponents() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + calculateCommunityMetrics "${@}" "${writePropertyName}" } # Community Detection using the Label Propagation Algorithm @@ -240,6 +233,8 @@ detectCommunitiesWithLabelPropagation() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + calculateCommunityMetrics "${@}" "${writePropertyName}" } # Community Detection using the K-Core Decomposition Algorithm @@ -274,6 +269,8 @@ detectCommunitiesWithKCoreDecomposition() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + 
calculateCommunityMetrics "${@}" "${writePropertyName}" } # Community Detection using the Approximate Maximum k-cut Algorithm @@ -310,6 +307,63 @@ detectCommunitiesWithApproximateMaximumKCut() { execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}" + + calculateCommunityMetrics "${@}" "${writePropertyName}" +} + +# Calculates community metrics including "Modularity" and "Conductance". +# The query results are written to CSV files in FULL_REPORT_DIRECTORY. Metric queries that fail are tolerated. +# +# Required Parameters: +# - dependencies_projection=... +# Name prefix for the in-memory projection name for dependencies. Example: "package" +# - dependencies_projection_node=... and dependencies_projection_write_property=... +# Label of the community member nodes and name of the node property that contains the community id (both are also extracted to build the CSV file name prefix) +# - dependencies_projection_weight_property=... +# Name of the relationship property that contains the dependency weight. Example: "weight" +calculateCommunityMetrics() { + local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection" + + local nodeLabel + nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}") + + local propertyName + propertyName=$( extractQueryParameter "dependencies_projection_write_property" "${@}") + + local fileNamePrefix + fileNamePrefix="${FULL_REPORT_DIRECTORY}/${nodeLabel}_${propertyName}_Community_" + + # Print results to CSV. NOTE(review): fileNamePrefix ends with "_" and the file suffixes start with "_", so the file names contain "__" (e.g. "..._Community__Metrics.csv") - confirm this is intended. 
+ local combinedMetrics + if combinedMetrics=$( execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_9_Community_Metrics.cypher" "${@}" ); then + echo "${combinedMetrics}" > "${fileNamePrefix}_Metrics.csv" + else + # Combined metrics failed. Try them one by one to get at least those that don't fail. 
+ local modularity + if modularity=$( execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_7d_Modularity_Members.cypher" "${@}" ); then + echo "${modularity}" > "${fileNamePrefix}_Modularity.csv" + fi + local conductance + if conductance=$( execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_8d_Conductance_Members.cypher" "${@}" ); then + echo "${conductance}" > "${fileNamePrefix}_Conductance.csv" + fi + fi + # Continue even if there were metrics that failed since they aren't essential + # and there seem to be open issues like: + # gds.modularity.stream ArrayIndexOutOfBoundsException: Index -1 out of bounds for length 100 +} + +# Write modularity for Leiden communities +# +# Required Parameters: +# - dependencies_projection=... +# Name prefix for the in-memory projection name for dependencies. Example: "package" +# - dependencies_projection_weight_property=... +# Name of the node property that contains the dependency weight. Example: "weight" +writeLeidenModularity() { + local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection" + local writePropertyName="dependencies_projection_write_property=communityLeidenId" + execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_7e_Write_Modularity.cypher" "${@}" "${writePropertyName}" } # Compare the results of different community detection algorighms @@ -389,5 +443,7 @@ detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_G # Type Community Detection - Special CSV Queries after update execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv" +execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv" +execute_cypher 
"${CYPHER_DIR}/Community_Detection/Type_communities_that_span_the_most_packages_with_type_statistics.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_that_span_the_most_packages_with_type_statistics.csv" echo "communityCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished" \ No newline at end of file