From 7024a0b6183b111c3fd673c1b5330edc5979e8b1 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 5 Nov 2024 10:35:15 +0100 Subject: [PATCH] initial commit --- .gitignore | 38 + .idea/codeStyles/Project.xml | 7 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/encodings.xml | 17 + .idea/misc.xml | 14 + .idea/packagesearch.xml | 6 + .idea/scala_compiler.xml | 6 + .idea/uiDesigner.xml | 124 +++ .idea/vcs.xml | 6 + .idea/workspace.xml | 229 ++++++ .../README.markdown | 7 + .../dhp-build-assembly-resources/pom.xml | 26 + .../resources/assemblies/oozie-installer.xml | 32 + .../src/main/resources/assemblies/tests.xml | 24 + .../resources/commands/get_working_dir.sh | 3 + .../resources/commands/print_working_dir.sh | 5 + .../main/resources/commands/readme.markdown | 5 + .../main/resources/commands/run_workflow.sh | 10 + .../resources/commands/upload_workflow.sh | 34 + .../main/resources/project-default.properties | 7 + .../README.markdown | 6 + .../dhp-build-properties-maven-plugin/pom.xml | 132 ++++ .../GenerateOoziePropertiesMojo.java | 76 ++ .../WritePredefinedProjectProperties.java | 447 +++++++++++ .../GenerateOoziePropertiesMojoTest.java | 108 +++ .../WritePredefinedProjectPropertiesTest.java | 391 ++++++++++ .../plugin/properties/included.properties | 1 + .../test.properties | 2 + dhp-build/dhp-code-style/pom.xml | 48 ++ .../main/resources/eclipse/formatter_aosp.xml | 252 ++++++ .../main/resources/eclipse/formatter_dnet.xml | 727 ++++++++++++++++++ .../resources/eclipse/formatter_google.xml | 337 ++++++++ dhp-build/pom.xml | 30 + dhp-raid/job-override.properties | 6 + dhp-raid/pom.xml | 545 +++++++++++++ .../dnetlib/raid/jobs/AbstractSparkJob.java | 74 ++ .../raid/jobs/SparkCreateEmbeddings.java | 217 ++++++ .../raid/jobs/SparkCreateEmbeddingsW2V.java | 213 +++++ .../raid/jobs/SparkRAiDClustering.java | 128 +++ .../support/ArgumentApplicationParser.java | 95 +++ .../eu/dnetlib/raid/support/EdgeParam.java | 33 + .../raid/support/OptionsParameter.java | 39 + .../eu/dnetlib/raid/support/RAiDConfig.java | 75 ++ .../dnetlib/raid/support/RandomWalkParam.java | 53 ++ .../parameters/createClusters_parameters.json | 32 + .../createEmbeddings_parameters.json | 32 + .../raid/oozie_app/config-default.xml | 18 + .../resources/raid/oozie_app/workflow.xml | 147 ++++ .../eu/dnetlib/raid/graph/AliasOps.scala | 75 ++ .../eu/dnetlib/raid/graph/GraphOps.scala | 67 ++ .../eu/dnetlib/raid/graph/GraphUtil.scala | 53 ++ .../scala/eu/dnetlib/raid/graph/package.scala | 13 + .../eu/dnetlib/raid/walker/RandomWalk.scala | 170 ++++ .../eu/dnetlib/raid/RAiDInferenceTest.java | 141 ++++ .../eu/dnetlib/raid/config/raid.conf.json | 40 + .../eu/dnetlib/raid/examples/graph/dataset | 3 + .../dnetlib/raid/examples/graph/publication | 4 + .../eu/dnetlib/raid/examples/graph/relation | 29 + .../eu/dnetlib/raid/examples/graph/software | 4 + pom.xml | 445 +++++++++++ 60 files changed, 5913 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/codeStyles/Project.xml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/packagesearch.xml create mode 100644 .idea/scala_compiler.xml create mode 100644 .idea/uiDesigner.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml create mode 100644 dhp-build/dhp-build-assembly-resources/README.markdown create mode 100644 dhp-build/dhp-build-assembly-resources/pom.xml create mode 100644 
dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/oozie-installer.xml create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/tests.xml create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/commands/get_working_dir.sh create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/commands/print_working_dir.sh create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/commands/readme.markdown create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/commands/run_workflow.sh create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/commands/upload_workflow.sh create mode 100644 dhp-build/dhp-build-assembly-resources/src/main/resources/project-default.properties create mode 100644 dhp-build/dhp-build-properties-maven-plugin/README.markdown create mode 100644 dhp-build/dhp-build-properties-maven-plugin/pom.xml create mode 100644 dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java create mode 100644 dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java create mode 100644 dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java create mode 100644 dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java create mode 100644 dhp-build/dhp-build-properties-maven-plugin/src/test/resources/eu/dnetlib/maven/plugin/properties/included.properties create mode 100644 dhp-build/dhp-build-properties-maven-plugin/test.properties create mode 100644 dhp-build/dhp-code-style/pom.xml create mode 100644 dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_aosp.xml create mode 100644 dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_dnet.xml create mode 100644 dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_google.xml create mode 100644 dhp-build/pom.xml create mode 100644 dhp-raid/job-override.properties create mode 100644 dhp-raid/pom.xml create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/jobs/AbstractSparkJob.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddings.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddingsW2V.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkRAiDClustering.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/support/ArgumentApplicationParser.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/support/EdgeParam.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/support/OptionsParameter.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/support/RAiDConfig.java create mode 100644 dhp-raid/src/main/java/eu/dnetlib/raid/support/RandomWalkParam.java create mode 100644 dhp-raid/src/main/resources/jobs/parameters/createClusters_parameters.json create mode 100644 dhp-raid/src/main/resources/jobs/parameters/createEmbeddings_parameters.json create mode 100644 dhp-raid/src/main/resources/raid/oozie_app/config-default.xml create mode 100644 dhp-raid/src/main/resources/raid/oozie_app/workflow.xml create mode 100644 dhp-raid/src/main/scala/eu/dnetlib/raid/graph/AliasOps.scala create mode 100644 dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphOps.scala create mode 100644 
dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphUtil.scala create mode 100644 dhp-raid/src/main/scala/eu/dnetlib/raid/graph/package.scala create mode 100644 dhp-raid/src/main/scala/eu/dnetlib/raid/walker/RandomWalk.scala create mode 100644 dhp-raid/src/test/java/eu/dnetlib/raid/RAiDInferenceTest.java create mode 100644 dhp-raid/src/test/resources/eu/dnetlib/raid/config/raid.conf.json create mode 100644 dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/dataset create mode 100644 dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/publication create mode 100644 dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/relation create mode 100644 dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/software create mode 100644 pom.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ff6309 --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### IntelliJ IDEA ### +.idea/modules.xml +.idea/jarRepositories.xml +.idea/compiler.xml +.idea/libraries/ +*.iws +*.iml +*.ipr + +### Eclipse ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ + +### Mac OS ### +.DS_Store \ No newline at end of file diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml new file mode 100644 index 0000000..919ce1f --- /dev/null +++ b/.idea/codeStyles/Project.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..5540b65 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..132404b --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/packagesearch.xml b/.idea/packagesearch.xml new file mode 100644 index 0000000..a1bdd94 --- /dev/null +++ b/.idea/packagesearch.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/scala_compiler.xml b/.idea/scala_compiler.xml new file mode 100644 index 0000000..7a37426 --- /dev/null +++ b/.idea/scala_compiler.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..2b63946 --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..70be17e --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,229 @@ + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + "keyToString": { + "RunOnceActivity.OpenProjectViewOnStart": "true", + "RunOnceActivity.ShowReadmeOnStart": "true", + "SHARE_PROJECT_CONFIGURATION_FILES": "true", + "WebServerToolWindowFactoryState": "false", + "jdk.selected.JAVA_MODULE": "1.8", + "last_opened_file_path": "/Users/miconis/IdeaProjects/RAiDInference/dhp-raid/src/main/scala/eu/dnetlib/raid/clustering/dbscan", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "project.structure.last.edited": "Modules", + "project.structure.proportion": "0.15", + "project.structure.side.proportion": "0.2", + "settings.editor.selected.configurable": "reference.settings.project.maven.repository.indices", + "spring.configuration.checksum": "8dfb42a635b85ca46f37a8c405f6a723", + "vue.rearranger.settings.migration": "true" + } +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1729084633471 + + + + + + \ No newline at end of file diff --git a/dhp-build/dhp-build-assembly-resources/README.markdown b/dhp-build/dhp-build-assembly-resources/README.markdown new file mode 100644 index 0000000..a345dff --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/README.markdown @@ -0,0 +1,7 @@ +Module utilized by `dhp-workflows`. + +Contains all required resources by this parent module: + +* assembly XML definitions +* build shell scripts +* oozie package commands for uploading, running and monitoring oozie workflows diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml new file mode 100644 index 0000000..80736b9 --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -0,0 +1,26 @@ + + + + 4.0.0 + + + eu.dnetlib + dhp-build + 1.0.0-SNAPSHOT + + + dhp-build-assembly-resources + jar + + This module contains a set of scripts supporting the build lifecycle for the dnet-hadoop project + + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + + + diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/oozie-installer.xml b/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/oozie-installer.xml new file mode 100644 index 0000000..1419c5b --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/oozie-installer.xml @@ -0,0 +1,32 @@ + + + oozie-installer + + dir + + + + + true + ${project.build.directory}/assembly-resources/commands + + / + + **/* + + 0755 + unix + + + / + diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/tests.xml b/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/tests.xml new file mode 100644 index 0000000..bf679e6 --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/assemblies/tests.xml @@ -0,0 +1,24 @@ + + + tests + + jar + + false + + + ${project.build.testOutputDirectory} + + + + + + \ No newline at end of file diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/get_working_dir.sh b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/get_working_dir.sh new file mode 100644 index 0000000..e9d55f0 --- /dev/null +++ 
b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/get_working_dir.sh @@ -0,0 +1,3 @@ +#!/bin/bash +hadoop fs -get ${workingDir} + diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/print_working_dir.sh b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/print_working_dir.sh new file mode 100644 index 0000000..c79839e --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/print_working_dir.sh @@ -0,0 +1,5 @@ +#!/bin/bash +echo "" +echo "---->Contents of the working directory" +hadoop fs -ls ${workingDir} + diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/readme.markdown b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/readme.markdown new file mode 100644 index 0000000..3e049c1 --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/readme.markdown @@ -0,0 +1,5 @@ +Execute the scripts in the following order: + +1. `upload_workflow.sh` +2. `run_workflow.sh` +3. `print_working_dir.sh` or `get_working_dir.sh` diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/run_workflow.sh b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/run_workflow.sh new file mode 100644 index 0000000..fee3d77 --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/run_workflow.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +if [ $# = 0 ] ; then + oozie job -oozie ${oozieServiceLoc} -config job.properties -run +else + oozie job -oozie ${oozieServiceLoc} -config $1/job.properties -run +fi + + + diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/upload_workflow.sh b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/upload_workflow.sh new file mode 100644 index 0000000..c5d299c --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/commands/upload_workflow.sh @@ -0,0 +1,34 @@ +#!/bin/bash +exec 3>&1 +BASH_XTRACEFD=3 +set -x ## print every executed command + + +if [ $# = 0 ] ; then + target_dir_root=`pwd`'/${oozieAppDir}' +else + target_dir_root=`readlink -f $1`'/${oozieAppDir}' +fi + +# initial phase, creating symbolic links to jars in all subworkflows +# currently disabled +#libDir=$target_dir_root'/lib' +#dirs=`find $target_dir_root/* -maxdepth 10 -type d` +#for dir in $dirs +#do +# if [ -f $dir/workflow.xml ] +# then +# echo "creating symbolic links to jars in directory: $dir/lib" +# if [ ! 
-d "$dir/lib" ]; then +# mkdir $dir/lib +# fi +# find $libDir -type f -exec ln -s \{\} $dir/lib \; +# fi +#done + + +#uploading +hadoop fs -rm -r ${sandboxDir} +hadoop fs -mkdir -p ${sandboxDir} +hadoop fs -mkdir -p ${workingDir} +hadoop fs -put $target_dir_root ${sandboxDir} diff --git a/dhp-build/dhp-build-assembly-resources/src/main/resources/project-default.properties b/dhp-build/dhp-build-assembly-resources/src/main/resources/project-default.properties new file mode 100644 index 0000000..84a56f1 --- /dev/null +++ b/dhp-build/dhp-build-assembly-resources/src/main/resources/project-default.properties @@ -0,0 +1,7 @@ +#sandboxName when not provided explicitly will be generated +sandboxName=${sandboxName} +sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName} +workingDir=${sandboxDir}/working_dir +oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir} +oozieTopWfApplicationPath = ${oozie.wf.application.path} + diff --git a/dhp-build/dhp-build-properties-maven-plugin/README.markdown b/dhp-build/dhp-build-properties-maven-plugin/README.markdown new file mode 100644 index 0000000..66234e8 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/README.markdown @@ -0,0 +1,6 @@ +Maven plugin module utilized by `dhp-workflows` for proper `job.properties` file building. + +It is based on http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html and supplemented with: + +* handling includePropertyKeysFromFiles property allowing writing only properties listed in given property files +As a final outcome only properties listed in `` element and listed as a keys in files from `` element will be written to output file. diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml new file mode 100644 index 0000000..6264219 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -0,0 +1,132 @@ + + + + 4.0.0 + + + eu.dnetlib + dhp-build + 1.0.0-SNAPSHOT + + + dhp-build-properties-maven-plugin + 1.0.0-SNAPSHOT + maven-plugin + + This module is a maven plugin implementing custom properties substitutions in the build lifecycle + + + + org.apache.maven + maven-plugin-api + 3.6.3 + provided + + + org.apache.maven + maven-project + 2.2.1 + provided + + + org.apache.maven + maven-artifact + 2.2.1 + provided + + + + org.kuali.maven.plugins + properties-maven-plugin + ${properties.maven.plugin.version} + + + com.google.code.findbugs + annotations + 3.0.1 + provided + + + com.google.code.findbugs + jsr305 + 3.0.1 + provided + + + + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + + + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + verify + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + true + + + + + + + + + org.apache.maven.plugins + maven-plugin-plugin + 3.2 + + true + + + + mojo-descriptor + process-classes + + descriptor + + + + + + + + + + diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java new file mode 100644 index 
0000000..10a25fd --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -0,0 +1,76 @@ + +package eu.dnetlib.maven.plugin.properties; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.maven.plugin.AbstractMojo; +import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.plugin.MojoFailureException; + +/** + * Generates oozie properties which were not provided from commandline. + * + * @author mhorst + * @goal generate-properties + */ +public class GenerateOoziePropertiesMojo extends AbstractMojo { + + public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; + public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; + + private final String[] limiters = { + "dhp", "dnetlib", "eu" + }; + + @Override + public void execute() throws MojoExecutionException, MojoFailureException { + if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) + && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { + String generatedSandboxName = generateSandboxName( + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + if (generatedSandboxName != null) { + System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); + } else { + System.out + .println( + "unable to generate sandbox name from path: " + + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + } + } + } + + /** + * Generates sandbox name from workflow source directory. + * + * @param wfSourceDir + * @return generated sandbox name + */ + private String generateSandboxName(String wfSourceDir) { + // utilize all dir names until finding one of the limiters + List sandboxNameParts = new ArrayList(); + String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); + ArrayUtils.reverse(tokens); + if (tokens.length > 0) { + for (String token : tokens) { + for (String limiter : limiters) { + if (limiter.equals(token)) { + return sandboxNameParts.size() > 0 + ? StringUtils.join(sandboxNameParts.toArray()) + : null; + } + } + if (sandboxNameParts.size() > 0) { + sandboxNameParts.add(0, File.separator); + } + sandboxNameParts.add(0, token); + } + return StringUtils.join(sandboxNameParts.toArray()); + } else { + return null; + } + } +} diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java new file mode 100644 index 0000000..d195ca8 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java @@ -0,0 +1,447 @@ +/** + * Licensed under the Educational Community License, Version 2.0 (the "License"); you may not use + * this file except in compliance with the License. You may obtain a copy of the License at + * + *
http://www.opensource.org/licenses/ecl2.php + + *
Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ + +package eu.dnetlib.maven.plugin.properties; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.Map.Entry; +import java.util.Properties; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.maven.plugin.AbstractMojo; +import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.plugin.MojoFailureException; +import org.apache.maven.project.MavenProject; +import org.springframework.core.io.DefaultResourceLoader; +import org.springframework.core.io.Resource; +import org.springframework.core.io.ResourceLoader; + +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; + +/** + * Writes project properties for the keys listed in specified properties files. Based on: + * http://site.kuali.org/maven/plugins/properties-maven-plugin/2.0.1/write-project-properties-mojo.html + * + * @author mhorst + * @goal write-project-properties + */ +public class WritePredefinedProjectProperties extends AbstractMojo { + + private static final String CR = "\r"; + private static final String LF = "\n"; + private static final String TAB = "\t"; + protected static final String PROPERTY_PREFIX_ENV = "env."; + private static final String ENCODING_UTF8 = "utf8"; + + /** @parameter property="properties.includePropertyKeysFromFiles" */ + private String[] includePropertyKeysFromFiles; + + /** + * @parameter default-value="${project}" + * @required + * @readonly + */ + protected MavenProject project; + + /** + * The file that properties will be written to + * + * @parameter property="properties.outputFile" + * default-value="${project.build.directory}/properties/project.properties"; + * @required + */ + protected File outputFile; + + /** + * If true, the plugin will silently ignore any non-existent properties files, and the build will continue + * + * @parameter property="properties.quiet" default-value="true" + */ + private boolean quiet; + + /** + * Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed, + * tab=tab. Any other values are taken literally. + * + * @parameter default-value="cr,lf,tab" property="properties.escapeChars" + */ + private String escapeChars; + + /** + * If true, the plugin will include system properties when writing the properties file. System properties override + * both environment variables and project properties. + * + * @parameter default-value="false" property="properties.includeSystemProperties" + */ + private boolean includeSystemProperties; + + /** + * If true, the plugin will include environment variables when writing the properties file. Environment variables + * are prefixed with "env". Environment variables override project properties. 
+ * + * @parameter default-value="false" property="properties.includeEnvironmentVariables" + */ + private boolean includeEnvironmentVariables; + + /** + * Comma separated set of properties to exclude when writing the properties file + * + * @parameter property="properties.exclude" + */ + private String exclude; + + /** + * Comma separated set of properties to write to the properties file. If provided, only the properties matching + * those supplied here will be written to the properties file. + * + * @parameter property="properties.include" + */ + private String include; + + /* + * (non-Javadoc) + * @see org.apache.maven.plugin.AbstractMojo#execute() + */ + @Override + @SuppressFBWarnings({ + "NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD" + }) + public void execute() throws MojoExecutionException, MojoFailureException { + Properties properties = new Properties(); + // Add project properties + properties.putAll(project.getProperties()); + if (includeEnvironmentVariables) { + // Add environment variables, overriding any existing properties with the same key + properties.putAll(getEnvironmentVariables()); + } + if (includeSystemProperties) { + // Add system properties, overriding any existing properties with the same key + properties.putAll(System.getProperties()); + } + + // Remove properties as appropriate + trim(properties, exclude, include); + + String comment = "# " + new Date() + "\n"; + List escapeTokens = getEscapeChars(escapeChars); + + getLog().info("Creating " + outputFile); + writeProperties(outputFile, comment, properties, escapeTokens); + } + + /** + * Provides environment variables. + * + * @return environment variables + */ + protected static Properties getEnvironmentVariables() { + Properties props = new Properties(); + for (Entry entry : System.getenv().entrySet()) { + props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue()); + } + return props; + } + + /** + * Removes properties which should not be written. + * + * @param properties + * @param omitCSV + * @param includeCSV + * @throws MojoExecutionException + */ + protected void trim(Properties properties, String omitCSV, String includeCSV) + throws MojoExecutionException { + List omitKeys = getListFromCSV(omitCSV); + for (String key : omitKeys) { + properties.remove(key); + } + + List includeKeys = getListFromCSV(includeCSV); + // mh: including keys from predefined properties + if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { + for (String currentIncludeLoc : includePropertyKeysFromFiles) { + if (validate(currentIncludeLoc)) { + Properties p = getProperties(currentIncludeLoc); + for (String key : p.stringPropertyNames()) { + includeKeys.add(key); + } + } + } + } + if (includeKeys != null && !includeKeys.isEmpty()) { + // removing only when include keys provided + Set keys = properties.stringPropertyNames(); + for (String key : keys) { + if (!includeKeys.contains(key)) { + properties.remove(key); + } + } + } + } + + /** + * Checks whether file exists. + * + * @param location + * @return true when exists, false otherwise. + */ + protected boolean exists(String location) { + if (StringUtils.isBlank(location)) { + return false; + } + File file = new File(location); + if (file.exists()) { + return true; + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.exists(); + } + + /** + * Validates resource location. 
+ * + * @param location + * @return true when valid, false otherwise + * @throws MojoExecutionException + */ + protected boolean validate(String location) throws MojoExecutionException { + boolean exists = exists(location); + if (exists) { + return true; + } + if (quiet) { + getLog().info("Ignoring non-existent properties file '" + location + "'"); + return false; + } else { + throw new MojoExecutionException("Non-existent properties file '" + location + "'"); + } + } + + /** + * Provides input stream. + * + * @param location + * @return input stream + * @throws IOException + */ + protected InputStream getInputStream(String location) throws IOException { + File file = new File(location); + if (file.exists()) { + return new FileInputStream(location); + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.getInputStream(); + } + + /** + * Creates properties for given location. + * + * @param location + * @return properties for given location + * @throws MojoExecutionException + */ + protected Properties getProperties(String location) throws MojoExecutionException { + InputStream in = null; + try { + Properties properties = new Properties(); + in = getInputStream(location); + if (location.toLowerCase().endsWith(".xml")) { + properties.loadFromXML(in); + } else { + properties.load(in); + } + return properties; + } catch (IOException e) { + throw new MojoExecutionException("Error reading properties file " + location, e); + } finally { + IOUtils.closeQuietly(in); + } + } + + /** + * Provides escape characters. + * + * @param escapeChars + * @return escape characters + */ + protected List getEscapeChars(String escapeChars) { + List tokens = getListFromCSV(escapeChars); + List realTokens = new ArrayList(); + for (String token : tokens) { + String realToken = getRealToken(token); + realTokens.add(realToken); + } + return realTokens; + } + + /** + * Provides real token. + * + * @param token + * @return real token + */ + protected String getRealToken(String token) { + if (token.equalsIgnoreCase("CR")) { + return CR; + } else if (token.equalsIgnoreCase("LF")) { + return LF; + } else if (token.equalsIgnoreCase("TAB")) { + return TAB; + } else { + return token; + } + } + + /** + * Returns content. + * + * @param comment + * @param properties + * @param escapeTokens + * @return content + */ + protected String getContent(String comment, Properties properties, List escapeTokens) { + List names = new ArrayList(properties.stringPropertyNames()); + Collections.sort(names); + StringBuilder sb = new StringBuilder(); + if (!StringUtils.isBlank(comment)) { + sb.append(comment); + } + for (String name : names) { + String value = properties.getProperty(name); + String escapedValue = escape(value, escapeTokens); + sb.append(name + "=" + escapedValue + "\n"); + } + return sb.toString(); + } + + /** + * Writes properties to given file. + * + * @param file + * @param comment + * @param properties + * @param escapeTokens + * @throws MojoExecutionException + */ + protected void writeProperties( + File file, String comment, Properties properties, List escapeTokens) + throws MojoExecutionException { + try { + String content = getContent(comment, properties, escapeTokens); + FileUtils.writeStringToFile(file, content, ENCODING_UTF8); + } catch (IOException e) { + throw new MojoExecutionException("Error creating properties file", e); + } + } + + /** + * Escapes characters. 
+ * + * @param s + * @param escapeChars + * @return + */ + protected String escape(String s, List escapeChars) { + String result = s; + for (String escapeChar : escapeChars) { + result = result.replace(escapeChar, getReplacementToken(escapeChar)); + } + return result; + } + + /** + * Provides replacement token. + * + * @param escapeChar + * @return replacement token + */ + protected String getReplacementToken(String escapeChar) { + if (escapeChar.equals(CR)) { + return "\\r"; + } else if (escapeChar.equals(LF)) { + return "\\n"; + } else if (escapeChar.equals(TAB)) { + return "\\t"; + } else { + return "\\" + escapeChar; + } + } + + /** + * Returns list from csv. + * + * @param csv + * @return list of values generated from CSV + */ + protected static final List getListFromCSV(String csv) { + if (StringUtils.isBlank(csv)) { + return new ArrayList(); + } + List list = new ArrayList(); + String[] tokens = StringUtils.split(csv, ","); + for (String token : tokens) { + list.add(token.trim()); + } + return list; + } + + public void setIncludeSystemProperties(boolean includeSystemProperties) { + this.includeSystemProperties = includeSystemProperties; + } + + public void setEscapeChars(String escapeChars) { + this.escapeChars = escapeChars; + } + + public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) { + this.includeEnvironmentVariables = includeEnvironmentVariables; + } + + public void setExclude(String exclude) { + this.exclude = exclude; + } + + public void setInclude(String include) { + this.include = include; + } + + public void setQuiet(boolean quiet) { + this.quiet = quiet; + } + + /** + * Sets property files for which keys properties should be included. + * + * @param includePropertyKeysFromFiles + */ + public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { + if (includePropertyKeysFromFiles != null) { + this.includePropertyKeysFromFiles = Arrays + .copyOf(includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); + } + } +} diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java new file mode 100644 index 0000000..cd86358 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -0,0 +1,108 @@ + +package eu.dnetlib.maven.plugin.properties; + +import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; +import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.*; + +/** @author mhorst, claudio.atzori */ +public class GenerateOoziePropertiesMojoTest { + + private static final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); + + public void clearSystemProperties() { + System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); + System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); + } + + @Test + public void testExecuteEmpty() throws Exception { + clearSystemProperties(); + + // execute + mojo.execute(); + + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } + + @Test + public void testExecuteSandboxNameAlreadySet() throws Exception { + + 
clearSystemProperties(); + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + String sandboxName = "originalSandboxName"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); + + // execute + mojo.execute(); + + // assert + assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } + + @Test //fails + public void testExecuteEmptyWorkflowSourceDir() throws Exception { + clearSystemProperties(); + + // given + String workflowSourceDir = ""; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + + // execute + mojo.execute(); + + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } + + @Test + public void testExecuteNullSandboxNameGenerated() throws Exception { + clearSystemProperties(); + + // given + String workflowSourceDir = "eu/dnetlib/dhp/"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + + // execute + mojo.execute(); + + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } + + @Test + public void testExecute() throws Exception { + + clearSystemProperties(); + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + + // execute + mojo.execute(); + + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } + + @Test + public void testExecuteWithoutRoot() throws Exception { + + clearSystemProperties(); + // given + String workflowSourceDir = "wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + + // execute + mojo.execute(); + + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } +} diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java new file mode 100644 index 0000000..2cdbae4 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -0,0 +1,391 @@ + +package eu.dnetlib.maven.plugin.properties; + +import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.lenient; + +import java.io.*; +import java.util.Properties; + +import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.project.MavenProject; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; + +/** @author mhorst, claudio.atzori */ +@Disabled +@ExtendWith(MockitoExtension.class) +public class WritePredefinedProjectPropertiesTest { + + @Mock + private MavenProject mavenProject; + + private WritePredefinedProjectProperties mojo; + + @TempDir File testFolder; + + public void init(File testFolder) { + MockitoAnnotations.initMocks(this); + mojo = new WritePredefinedProjectProperties(); + mojo.outputFile = getPropertiesFileLocation(testFolder); + mojo.project = mavenProject; + lenient().doReturn(new 
Properties()).when(mavenProject).getProperties(); + } + + // ----------------------------------- TESTS --------------------------------------------- + + @Test + public void testExecuteEmpty() throws Exception { + init(testFolder); + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(0, storedProperties.size()); + } + + @Test + public void testExecuteWithProjectProperties() throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } + + @Test() + public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.outputFile = testFolder; + + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } + + @Test + public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String excludedKey = "excludedPropertyKey"; + String excludedValue = "excludedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(excludedKey, excludedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setExclude(excludedKey); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } + + @Test + public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setInclude(includedKey); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } + + @Test + public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { + init(testFolder); + // 
given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + + File includedPropertiesFile = new File(testFolder, "included.properties"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new FileWriter(includedPropertiesFile), null); + + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } + + @Test + public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) + throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + + mojo + .setIncludePropertyKeysFromFiles( + new String[] { + "/eu/dnetlib/maven/plugin/properties/included.properties" + }); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } + + @Test + public void testExecuteIncludingPropertyKeysFromBlankLocation() { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + + mojo.setIncludePropertyKeysFromFiles(new String[] { + "" + }); + + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } + + @Test + public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) + throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + + File includedPropertiesFile = new File(testFolder, "included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + 
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); + + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } + + @Test + public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) + throws Exception { + init(testFolder); + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + + File includedPropertiesFile = new File(testFolder, "included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new FileOutputStream(includedPropertiesFile), null); + + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); + + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } + + @Test + public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { + init(testFolder); + // given + mojo.setQuiet(true); + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(0, storedProperties.size()); + } + + @Test + public void testExecuteIncludingPropertyKeysFromInvalidFile() { + init(testFolder); + // given + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); + + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } + + @Test + public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { + init(testFolder); + // given + mojo.setIncludeEnvironmentVariables(true); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + for (Object currentKey : storedProperties.keySet()) { + assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); + } + } + + @Test + public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { + init(testFolder); + // given + String key = "systemPropertyKey"; + String value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } + + @Test + public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) + throws Exception { + init(testFolder); + // given + String key = "systemPropertyKey "; + String 
value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); + String escapeChars = "cr,lf,tab,|"; + mojo.setEscapeChars(escapeChars); + + // execute + mojo.execute(); + + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertFalse(storedProperties.containsKey(key)); + assertTrue(storedProperties.containsKey(key.trim())); + assertEquals(value, storedProperties.getProperty(key.trim())); + } + + // ----------------------------------- PRIVATE ------------------------------------------- + + private File getPropertiesFileLocation(File testFolder) { + return new File(testFolder, "test.properties"); + } + + private Properties getStoredProperties(File testFolder) + throws IOException { + Properties properties = new Properties(); + properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); + return properties; + } +} diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/resources/eu/dnetlib/maven/plugin/properties/included.properties b/dhp-build/dhp-build-properties-maven-plugin/src/test/resources/eu/dnetlib/maven/plugin/properties/included.properties new file mode 100644 index 0000000..3c79fe6 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/resources/eu/dnetlib/maven/plugin/properties/included.properties @@ -0,0 +1 @@ +includedPropertyKey=irrelevantValue \ No newline at end of file diff --git a/dhp-build/dhp-build-properties-maven-plugin/test.properties b/dhp-build/dhp-build-properties-maven-plugin/test.properties new file mode 100644 index 0000000..66aeee4 --- /dev/null +++ b/dhp-build/dhp-build-properties-maven-plugin/test.properties @@ -0,0 +1,2 @@ +# Tue Nov 05 10:06:34 CET 2024 +projectPropertyKey=projectPropertyValue diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml new file mode 100644 index 0000000..a4f3e33 --- /dev/null +++ b/dhp-build/dhp-code-style/pom.xml @@ -0,0 +1,48 @@ + + + + 4.0.0 + + eu.dnetlib + dhp-code-style + 1.0.0-SNAPSHOT + + jar + + This module contains resources supporting common code style conventions + + + + dnet45-snapshots + DNet45 Snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + + dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + + + + + + + org.apache.maven.plugins + maven-project-info-reports-plugin + 3.0.0 + + + org.apache.maven.plugins + maven-site-plugin + 3.7.1 + + + + + + + UTF-8 + + + diff --git a/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_aosp.xml b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_aosp.xml new file mode 100644 index 0000000..d58709f --- /dev/null +++ b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_aosp.xml @@ -0,0 +1,252 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_dnet.xml 
b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_dnet.xml new file mode 100644 index 0000000..e4d85bf --- /dev/null +++ b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_dnet.xml @@ -0,0 +1,727 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_google.xml b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_google.xml new file mode 100644 index 0000000..56e5079 --- /dev/null +++ b/dhp-build/dhp-code-style/src/main/resources/eclipse/formatter_google.xml @@ -0,0 +1,337 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml new file mode 100644 index 0000000..03265a6 --- /dev/null +++ b/dhp-build/pom.xml @@ -0,0 +1,30 @@ + + + 4.0.0 + + eu.dnetlib + raid-inference + 1.0.0-SNAPSHOT + ../pom.xml + + dhp-build + pom + + This module is a container for the build tools used in dnet-hadoop + + + dhp-code-style + dhp-build-assembly-resources + dhp-build-properties-maven-plugin + + + + + iis-releases + iis releases plugin repository + http://maven.ceon.pl/artifactory/iis-releases + default + + + + diff --git a/dhp-raid/job-override.properties b/dhp-raid/job-override.properties new file mode 100644 index 0000000..4983822 --- /dev/null +++ b/dhp-raid/job-override.properties @@ -0,0 +1,6 @@ +graphBasePath = /tmp/beta_provision/graph/06_graph_inferred +workingPath = /user/michele.debonis/raid_inference_test/working_dir 
+clustersPath = /user/michele.debonis/raid_inference_test/clusters +embeddingsPath = /user/michele.debonis/raid_inference_test/embeddings +raidConfPath = /user/michele.debonis/raid.conf.json +numPartitions = 1000 \ No newline at end of file diff --git a/dhp-raid/pom.xml b/dhp-raid/pom.xml new file mode 100644 index 0000000..e022737 --- /dev/null +++ b/dhp-raid/pom.xml @@ -0,0 +1,545 @@ + + + 4.0.0 + + + eu.dnetlib + raid-inference + 1.0.0-SNAPSHOT + ../pom.xml + + dhp-raid + 1.0-SNAPSHOT + + + 1.8 + 2.11 + 2.11.0 + UTF-8 + UTF-8 + 2.2.6 + UTF-8 + 2.7.3 + 2.3.2 + 2.8.1 + 8.0.1 + + + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-mllib_${scala.binary.version} + ${spark.version} + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.junit.jupiter + junit-jupiter + + + org.junit.jupiter + junit-jupiter + RELEASE + test + + + + com.jayway.jsonpath + json-path + 2.4.0 + + + + eu.dnetlib.dhp + dhp-schemas + ${dhp-schemas.version} + + + + de.lmu.ifi.dbs.elki + elki + + + + com.github.haifengl + smile-core + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.0 + + 1.8 + 1.8 + + + + + net.alchim31.maven + scala-maven-plugin + 3.4.4 + + + scala-compile-first + process-resources + + compile + + + + + + + + + + oozie-package + + + + org.apache.maven.plugins + maven-enforcer-plugin + 1.4.1 + + + enforce-connection-properties-file-existence + initialize + + enforce + + + + + + ${dhpConnectionProperties} + + + The file with connection properties could not be found. Please, create the ${dhpConnectionProperties} file or set the location to another already created file by using + -DdhpConnectionProperties property. 
+ + + + true + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy dependencies + prepare-package + + copy-dependencies + + + ${oozie.package.dependencies.include.scope} + ${oozie.package.dependencies.exclude.scope} + true + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + attach-test-resources-package + prepare-package + + test-jar + + + ${oozie.package.skip.test.jar} + + + + + + + eu.dnetlib.primer + primer-maven-plugin + 1.2.0 + + + priming + prepare-package + + prime + + + + ${project.build.directory}/dependency/*.jar + ${project.build.directory}/*-tests.jar + ${project.build.directory}/classes + + ${project.build.directory}/dependency + ${project.build.directory}/${primed.dir} + ${workflow.source.dir} + + + + + + + org.kuali.maven.plugins + properties-maven-plugin + ${properties.maven.plugin.version} + + + eu.dnetlib + dhp-build-assembly-resources + 1.0.0-SNAPSHOT + + + + + + reading-dhp-properties + initialize + + read-project-properties + + + + ${dhpConnectionProperties} + + false + + + + read-default-properties + prepare-package + + read-project-properties + + + + classpath:project-default.properties + + true + + + + read-job-properties + prepare-package + + read-project-properties + + + + ${project.build.directory}/${primed.dir}/job.properties + job-override.properties + + true + + + + + + eu.dnetlib + dhp-build-properties-maven-plugin + 1.0.0-SNAPSHOT + + + validate + + generate-properties + + + + + + + write-job-properties + prepare-package + + write-project-properties + + + target/${oozie.package.file.name}/job.properties + + nameNode,jobTracker,queueName,importerQueueName,oozieLauncherQueueName, + workingDir,oozieTopWfApplicationPath,oozieServiceLoc, + sparkDriverMemory,sparkExecutorMemory,sparkExecutorCores, + oozie.wf.application.path,projectVersion,oozie.use.system.libpath, + oozieActionShareLibForSpark1,spark1YarnHistoryServerAddress,spark1EventLogDir, + oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir, + sparkSqlWarehouseDir + + true + + + ${project.build.directory}/${primed.dir}/job.properties + job-override.properties + + + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.11 + + + + revision + + + + + true + yyyy-MM-dd'T'HH:mm:ssZ + true + target/${oozie.package.file.name}/${oozieAppDir}/version.properties + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.0.0 + + + eu.dnetlib + dhp-build-assembly-resources + 1.0.0-SNAPSHOT + + + + + assembly-oozie-installer + package + + single + + + false + ${oozie.package.file.name}_shell_scripts + + oozie-installer + + + + + + + + + + + maven-antrun-plugin + + + + installer-copy-custom + process-resources + + run + + + + + + + + + + + package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + run + + + + + + + + + deploy + + + + org.codehaus.mojo + exec-maven-plugin + 1.5.0 + + + create-target-dir + package + + exec + + + ssh + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/ + + + + + upload-oozie-package + package + + exec + + + scp + + -P ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + target/${oozie.package.file.name}.tar.gz + 
${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz + + + + + extract-and-upload-to-hdfs + package + + exec + + + ssh + + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; + tar -zxf oozie-package.tar.gz; + rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; + ./upload_workflow.sh + + + + + + + + + + run + + + + org.codehaus.mojo + exec-maven-plugin + 1.5.0 + + + run-job + package + + exec + + + ssh + + ${oozie.execution.log.file.location} + + ${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name} + -p ${dhp.hadoop.frontend.port.ssh} + -o StrictHostKeyChecking=no + cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; + ./run_workflow.sh + + + + + show-run-log-on-stdout + package + + exec + + + cat + + ${oozie.execution.log.file.location} + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/AbstractSparkJob.java b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/AbstractSparkJob.java new file mode 100644 index 0000000..3fc6780 --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/AbstractSparkJob.java @@ -0,0 +1,74 @@ +package eu.dnetlib.raid.jobs; + +import eu.dnetlib.raid.support.ArgumentApplicationParser; +import eu.dnetlib.raid.support.RAiDConfig; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.util.stream.Collectors; + +public abstract class AbstractSparkJob implements Serializable { + + protected static final int NUM_PARTITIONS = 1000; + protected static final int SEED = 42; + protected static final String EMBEDDING_COL = "embedding"; + protected static final String STRING_ID_COL = "id"; + protected static final String LONG_ID_COL = "longId"; + protected static final String RANDOM_WALK_COL = "random_walk"; + protected static final String LABEL_ID_COL = "labelId"; + protected static final String LABELS_COL = "labels"; + protected static final String PARTITION_COL = "partition"; + + public ArgumentApplicationParser parser; // parameters for the spark action + public SparkSession spark; // the spark session + + public AbstractSparkJob() {} + + public AbstractSparkJob(ArgumentApplicationParser parser, SparkSession spark) { + + this.parser = parser; + this.spark = spark; + } + + abstract void run() throws Exception; + + protected static SparkSession getSparkSession(SparkConf conf) { + return SparkSession.builder().config(conf).getOrCreate(); + } + + protected static void save(Dataset dataset, String outPath, SaveMode mode) { + dataset.write().option("compression", "gzip").mode(mode).json(outPath); + } + + protected static String readFileFromHDFS(String filePath) throws IOException { + + Path path=new Path(filePath); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new 
InputStreamReader(fs.open(path))); + try { + return String.join("", br.lines().collect(Collectors.toList())); + } finally { + br.close(); + } + } + + public static String readResource(String path, Class clazz) throws IOException { + return IOUtils.toString(clazz.getResourceAsStream(path)); + } + + protected static RAiDConfig loadRAiDConfig(String raidConfPath) throws IOException { + return RAiDConfig.load( + readFileFromHDFS(raidConfPath) + ); + } +} \ No newline at end of file diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddings.java b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddings.java new file mode 100644 index 0000000..ef88839 --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddings.java @@ -0,0 +1,217 @@ +package eu.dnetlib.raid.jobs; + +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.raid.graph.GraphUtil; +import eu.dnetlib.raid.support.ArgumentApplicationParser; +import eu.dnetlib.raid.support.EdgeParam; +import eu.dnetlib.raid.support.RAiDConfig; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.graphx.Edge; +import org.apache.spark.ml.feature.Normalizer; +import org.apache.spark.ml.feature.Word2Vec; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static org.apache.spark.sql.functions.*; + + +public class SparkCreateEmbeddings extends AbstractSparkJob{ + + private static final Logger log = LoggerFactory.getLogger(SparkCreateEmbeddings.class); + private static final String ID_PATH = "$.id"; + private static final String DELETEDBYINFERENCE_PATH = "$.dataInfo.deletedbyinference"; + private static final Encoder REL_BEAN_ENC = Encoders.bean(Relation.class); + + public SparkCreateEmbeddings(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + readResource("/jobs/parameters/createEmbeddings_parameters.json", SparkCreateEmbeddings.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkCreateEmbeddings( + parser, + getSparkSession(conf) + ).run(); + } + + @Override + public void run() throws Exception { + + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String outputPath = parser.get("outputPath"); + final String raidConfPath = parser.get("raidConfPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("outputPath: '{}'", outputPath); + log.info("raidConfPath: '{}'", raidConfPath); + log.info("numPartitions: '{}'", numPartitions); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + RAiDConfig config = loadRAiDConfig(raidConfPath); + + Dataset nodeDS = prepareNodes(sc, graphBasePath, config); + 
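+        // nodeDS maps each entity's string id to a synthetic long id; GraphX operates on long vertex ids,
+        // and the original string ids are recovered later by joining back on LONG_ID_COL.
+        // For every configured metapath, connected components are computed and the component ids are
+        // collected per node as "labels"; Word2Vec then embeds nodes from these label lists and the
+        // resulting vectors are L2-normalized before being written out.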
log.info("Number of nodes: {}", nodeDS.count()); + + RDD> vertices = createVertices(nodeDS); + + Dataset relations = spark + .read() + .schema(REL_BEAN_ENC.schema()) + .json(graphBasePath + "/relation"); + log.info("Number of relations: {}", relations.count()); + + Dataset labelDS = spark.createDataFrame( + new ArrayList<>(), DataTypes.createStructType( + new StructField[]{ + DataTypes.createStructField(LONG_ID_COL, DataTypes.LongType, false), + DataTypes.createStructField(LABEL_ID_COL, DataTypes.StringType, false) + })); + + for(EdgeParam edgeParam: config.getEdges()) { + + RDD> edges = createEdges(nodeDS, relations, edgeParam.getMetapath(), edgeParam.getName()); + + Dataset connectedComponents = GraphUtil.createConnectedComponents(spark, vertices, edges, 1); + + //this is getting rid of those connected components of a single element (they are not meaningful ?) + Dataset counts = connectedComponents + .groupBy(LABEL_ID_COL) + .agg(functions.count(LABEL_ID_COL).alias("count")) + .filter(functions.col("count").gt(1)); + connectedComponents = connectedComponents + .join(counts, connectedComponents.col(LABEL_ID_COL).equalTo(counts.col(LABEL_ID_COL)), "inner") + .select(connectedComponents.col(LONG_ID_COL), connectedComponents.col(LABEL_ID_COL)); + + labelDS = labelDS.union(connectedComponents); + + } + + Dataset labels = labelDS + .groupBy(col(LONG_ID_COL)) + .agg(collect_list(labelDS.col(LABEL_ID_COL)).as(LABELS_COL)); + + Dataset result = nodeDS + .join(labels, nodeDS.col(LONG_ID_COL).equalTo(labels.col(LONG_ID_COL)), "left") + .select(col(STRING_ID_COL), col(LABELS_COL)); + + log.info("Labels generated: {}", result.count()); + + Dataset word2vecEmbeddings = new Word2Vec() + .setInputCol(LABELS_COL) + .setOutputCol("word2vec_" + EMBEDDING_COL) + .setVectorSize(config.getParams().getOrDefault("embeddingSize", 128).intValue()) + .setStepSize(config.getParams().getOrDefault("learningRate", 0.01).doubleValue()) + .setMinCount(config.getParams().getOrDefault("minCount", 1).intValue()) + .setMaxIter(config.getParams().getOrDefault("maxIter", 10).intValue()) + .fit(result) + .transform(result); + + Dataset embeddings = new Normalizer() + .setInputCol("word2vec_" + EMBEDDING_COL) + .setOutputCol(EMBEDDING_COL) + .setP(2.0) + .transform(word2vecEmbeddings) + .select(col(STRING_ID_COL), col(EMBEDDING_COL)); + + embeddings.write().save(outputPath); + + } + + public Dataset prepareNodes(JavaSparkContext sc, String graphBasePath, RAiDConfig config) { + // create nodes + JavaRDD nodes = sc.emptyRDD(); + for (String nodeType: config.getNodes()) { + nodes = nodes.union( + sc.textFile(graphBasePath + "/" + nodeType) + .filter(json -> !((boolean) JsonPath.read(json, DELETEDBYINFERENCE_PATH))) + .map(json -> JsonPath.read(json, ID_PATH))); + } + + return spark.createDataset( + nodes.zipWithIndex().map(n -> RowFactory.create(n._1(), n._2())).rdd(), + RowEncoder.apply( + DataTypes.createStructType( + new StructField[]{ + DataTypes.createStructField(STRING_ID_COL, DataTypes.StringType, false), + DataTypes.createStructField(LONG_ID_COL, DataTypes.LongType, false) + }) + )); + } + + public RDD> createVertices(Dataset nodeDS) { + return nodeDS + .toJavaRDD() + .map(row -> new Tuple2<>(row.get(1), row.getString(0))) + .rdd(); + } + + public RDD> createEdges(Dataset nodeDS, Dataset relations, List metapath, String edgeName) throws Exception { + + Dataset edgesRow; + switch(metapath.size()) { + case 1: + edgesRow = relations + .where(col("relClass").equalTo(metapath.get(0))) + .select(col("source").as("src"), 
col("target").as("dst"), col("relClass")); + break; + case 2: + Dataset edges_1 = relations + .where(col("relClass").equalTo(metapath.get(0))) + .select(col("source").as("source_1"), col("target").as("target_1")); + Dataset edges_2 = relations + .where(col("relClass").equalTo(metapath.get(1))) + .select(col("source").as("source_2"), col("target").as("target_2")); + + edgesRow = edges_1 + .join(edges_2, edges_1.col("target_1").equalTo(edges_2.col("source_2"))) + .select(col("source_1").as("src"), col("target_2").as("dst")) + .withColumn("relClass", lit(edgeName)); + break; + default: + throw new Exception("Metapath size not allowed"); + } + // join with nodes to get longs instead of string ids + return edgesRow + .join(nodeDS, edgesRow.col("src").equalTo(nodeDS.col(STRING_ID_COL))) + .select(col(LONG_ID_COL).as("src"), col("dst"), col("relClass")) + .join(nodeDS, edgesRow.col("dst").equalTo(nodeDS.col(STRING_ID_COL))) + .select(col("src"), col(LONG_ID_COL).as("dst"), col("relClass")) + .toJavaRDD() + .map(row -> + new Edge<>( + row.getLong(0), + row.getLong(1), + row.getString(2) + ) + ) + .rdd(); + } +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddingsW2V.java b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddingsW2V.java new file mode 100644 index 0000000..0aad7e1 --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkCreateEmbeddingsW2V.java @@ -0,0 +1,213 @@ +package eu.dnetlib.raid.jobs; + +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.raid.graph.GraphUtil; +import eu.dnetlib.raid.support.ArgumentApplicationParser; +import eu.dnetlib.raid.support.EdgeParam; +import eu.dnetlib.raid.support.RAiDConfig; +import eu.dnetlib.raid.support.RandomWalkParam; +import eu.dnetlib.raid.walker.RandomWalk; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.Normalizer; +import org.apache.spark.ml.feature.Word2Vec; +import org.apache.spark.ml.feature.Word2VecModel; + +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.*; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static org.apache.spark.sql.functions.col; +import static org.apache.spark.sql.functions.lit; + +public class SparkCreateEmbeddingsW2V extends AbstractSparkJob{ + + private static final Logger log = LoggerFactory.getLogger(SparkCreateEmbeddingsW2V.class); + + private static final String ID_PATH = "$.id"; + private static final String DELETEDBYINFERENCE_PATH = "$.dataInfo.deletedbyinference"; + private static final Encoder REL_BEAN_ENC = Encoders.bean(Relation.class); + + public SparkCreateEmbeddingsW2V(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + readResource("/jobs/parameters/createEmbeddings_parameters.json", SparkCreateEmbeddingsW2V.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkCreateEmbeddingsW2V( + parser, + getSparkSession(conf) + ).run(); + 
} + + @Override + public void run() throws Exception { + + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String outputPath = parser.get("outputPath"); + final String raidConfPath = parser.get("raidConfPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("outputPath: '{}'", outputPath); + log.info("raidConfPath: '{}'", raidConfPath); + log.info("numPartitions: '{}'", numPartitions); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + RAiDConfig config = loadRAiDConfig(raidConfPath); + + // create nodes + JavaRDD nodes = sc.emptyRDD(); + for (String nodeType: config.getNodes()) { + nodes = nodes.union( + sc.textFile(graphBasePath + "/" + nodeType) + .filter(json -> !((boolean) JsonPath.read(json, DELETEDBYINFERENCE_PATH))) + .map(json -> JsonPath.read(json, ID_PATH))); + } + RDD nodeRDD = nodes.zipWithIndex().map(n -> RowFactory.create(n._1(), n._2())).rdd(); + + Dataset nodeDS = spark.createDataset(nodeRDD, RowEncoder.apply( + new StructType( + new StructField[]{ + new StructField(STRING_ID_COL, DataTypes.StringType, false, Metadata.empty()), + new StructField(LONG_ID_COL, DataTypes.LongType, false, Metadata.empty()) + }) + )); + log.info("Number of nodes: {}", nodeDS.count()); + + Dataset relations = spark + .read() + .schema(REL_BEAN_ENC.schema()) + .json(graphBasePath + "/relation"); + log.info("Number of relations: {}", relations.count()); + + // create random walks + Dataset randomWalks = spark.createDataFrame( + new ArrayList<>(), new StructType( + new StructField[]{ + new StructField(RANDOM_WALK_COL, DataTypes.createArrayType(DataTypes.IntegerType), false, Metadata.empty()) + })); + for(EdgeParam edgeParam: config.getEdges()) { + + log.info("Creating '{}' edges with '{}' metapath", edgeParam.getName(), edgeParam.getMetapath()); + Dataset edges = createEdges(nodeDS, relations, edgeParam.getMetapath()); + log.info("Number of '{}' edges: {}", edgeParam.getName(), edges.count()); + + + RandomWalkParam randomWalkParam = config.getRandomWalks().get(edgeParam.getName()); + + Dataset randomWalk = new RandomWalk() + .setNumWalk(randomWalkParam.getNumWalks()) + .setWalkLength(randomWalkParam.getWalkLength()) + .setQ(randomWalkParam.getQ()) + .setP(randomWalkParam.getP()) + .setOutputCol(RANDOM_WALK_COL) + .randomWalk(edges); + log.info("Number of random walks for '{}' edges: {}", edgeParam.getName(), randomWalk.count()); + + randomWalks = randomWalks.union(randomWalk); + + } + + randomWalks.cache(); + + log.info("Creating embeddings"); + Word2VecModel word2VecModel = new Word2Vec() + .setMaxSentenceLength(config.getParams().getOrDefault("maxWalkLength", 100).intValue()) + .setMinCount(config.getParams().getOrDefault("minCount", 2).intValue()) + .setWindowSize(config.getParams().getOrDefault("windowSize", 5).intValue()) + .setVectorSize(config.getParams().getOrDefault("embeddingSize", 128).intValue()) + .setMaxIter(config.getParams().getOrDefault("maxIter", 20).intValue()) + .setStepSize(config.getParams().getOrDefault("learningRate", 0.1).doubleValue()) + .setSeed(SEED) + .setNumPartitions(numPartitions) + .setInputCol(RANDOM_WALK_COL) + .setOutputCol(EMBEDDING_COL) + .fit(randomWalks); + + Dataset embeddings = word2VecModel + .getVectors() + .toDF(LONG_ID_COL, 
EMBEDDING_COL); + + Normalizer normalizer = new Normalizer() + .setInputCol(EMBEDDING_COL) + .setOutputCol("normalized_" + EMBEDDING_COL) + .setP(2.0); + + embeddings = normalizer + .transform(embeddings) + .select(col(LONG_ID_COL), col("normalized_" + EMBEDDING_COL).as(EMBEDDING_COL)); + + Dataset result = nodeDS + .join(embeddings, nodeDS.col(LONG_ID_COL).equalTo(embeddings.col(LONG_ID_COL))) + .select(col(STRING_ID_COL), col(EMBEDDING_COL)); + + log.info("Number of generated embeddings: {}", result.count()); + + result.write().save(outputPath); + + } + + public Dataset createEdges(Dataset nodeDS, Dataset relations, List metapath) throws Exception { + + Dataset edges; + switch(metapath.size()) { + case 1: + edges = relations + .where(col("relClass").equalTo(metapath.get(0))) + .select(col("source").as("src"), col("target").as("dst")); + break; + case 2: + Dataset edges_1 = relations + .where(col("relClass").equalTo(metapath.get(0))) + .select(col("source").as("source_1"), col("target").as("target_1")); + Dataset edges_2 = relations + .where(col("relClass").equalTo(metapath.get(1))) + .select(col("source").as("source_2"), col("target").as("target_2")); + + edges = edges_1 + .join(edges_2, edges_1.col("target_1").equalTo(edges_2.col("source_2"))) + .select(col("source_1").as("src"), col("target_2").as("dst")); + break; + default: + throw new Exception("Metapath size not allowed"); + } + + // join with nodes to get longs instead of string ids + edges = edges + .join(nodeDS, edges.col("src").equalTo(nodeDS.col(STRING_ID_COL))) + .select(col(LONG_ID_COL).as("src"), col("dst")) + .join(nodeDS, edges.col("dst").equalTo(nodeDS.col(STRING_ID_COL))) + .select(col("src"), col(LONG_ID_COL).as("dst")) + .withColumn("weight", lit(1.0)); + + return GraphUtil.preProcess(edges, "src", "dst", "weight"); + } + +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkRAiDClustering.java b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkRAiDClustering.java new file mode 100644 index 0000000..a50290d --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/jobs/SparkRAiDClustering.java @@ -0,0 +1,128 @@ +package eu.dnetlib.raid.jobs; + +import com.google.common.collect.Iterators; +import eu.dnetlib.raid.support.ArgumentApplicationParser; +import eu.dnetlib.raid.support.RAiDConfig; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapPartitionsFunction; +import org.apache.spark.ml.clustering.KMeans; +import org.apache.spark.ml.linalg.DenseVector; +import org.apache.spark.sql.*; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import smile.clustering.DBSCAN; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import static org.apache.spark.sql.functions.col; + +public class SparkRAiDClustering extends AbstractSparkJob { + + private static final Logger log = LoggerFactory.getLogger(SparkRAiDClustering.class); + + public SparkRAiDClustering(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } + + public static void main(String[] args) throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + readResource("/jobs/parameters/createClusters_parameters.json", SparkCreateEmbeddingsW2V.class) + ); + + parser.parseArgument(args); + + SparkConf conf = new SparkConf(); + + new SparkRAiDClustering( + parser, + getSparkSession(conf) + 
).run(); + } + + @Override + public void run() throws Exception { + + // read oozie parameters + final String inputPath = parser.get("inputPath"); + final String workingPath = parser.get("workingPath"); + final String outputPath = parser.get("outputPath"); + final String raidConfPath = parser.get("raidConfPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); + + log.info("inputPath: '{}'", inputPath); + log.info("workingPath: '{}'", workingPath); + log.info("outputPath: '{}'", outputPath); + log.info("raidConfPath: '{}'", raidConfPath); + log.info("numPartitions: '{}'", numPartitions); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + RAiDConfig config = loadRAiDConfig(raidConfPath); + + MapPartitionsFunction dbScan = partition -> { + + List points = new ArrayList<>(); + List ids = new ArrayList<>(); + int index = 0; //partition index + + while (partition.hasNext()) { + Row row = partition.next(); + index = row.getAs(PARTITION_COL); + double[] embedding = ((DenseVector) row.getAs(EMBEDDING_COL)).toArray(); + ids.add(row.getString(row.fieldIndex(STRING_ID_COL))); + points.add(embedding); + } + + if (points.size() > 0 ) { + + DBSCAN dbscan = DBSCAN.fit( + points.toArray(new double[0][]), + config.getParams().getOrDefault("minPts", 1).intValue(), + config.getParams().getOrDefault("epsilon", 0.1).doubleValue() + ); + + List results = new ArrayList<>(); + for (int i = 0; i < dbscan.y.length; i++) { + int clusterId = dbscan.y[i] == Integer.MAX_VALUE ? -1 : index * 1000 + dbscan.y[i]; + Row resultRow = RowFactory.create(ids.get(i), clusterId); + results.add(resultRow); + } + + return results.iterator(); + + } + else { + return Iterators.emptyIterator(); + } + + }; + + final Dataset embeddings = spark.read().load(inputPath); + + Dataset clusteredData = new KMeans() + .setK(numPartitions) + .setSeed(SEED) + .setFeaturesCol(EMBEDDING_COL) + .setPredictionCol(PARTITION_COL) + .setMaxIter(config.getParams().getOrDefault("partitioningMaxIter", 10).intValue()) + .fit(embeddings) + .transform(embeddings) + .repartition(col(PARTITION_COL)) + .mapPartitions(dbScan, Encoders.kryo(Row.class)); + + Dataset result = spark + .createDataFrame(clusteredData.rdd(), DataTypes.createStructType(new StructField[]{DataTypes.createStructField(STRING_ID_COL, DataTypes.StringType, false), DataTypes.createStructField(LABEL_ID_COL, DataTypes.IntegerType, false)})); + + result.write().save(outputPath); + + } + +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/support/ArgumentApplicationParser.java b/dhp-raid/src/main/java/eu/dnetlib/raid/support/ArgumentApplicationParser.java new file mode 100644 index 0000000..7db489e --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/support/ArgumentApplicationParser.java @@ -0,0 +1,95 @@ +package eu.dnetlib.raid.support; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.cli.*; +import org.apache.commons.io.IOUtils; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Serializable; +import java.io.StringWriter; +import java.util.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public class ArgumentApplicationParser implements Serializable { + + private final Options options = new Options(); + private final Map objectMap = new HashMap<>(); + + private final List compressedValues = new ArrayList<>(); + + public ArgumentApplicationParser(final String 
json_configuration) throws Exception { + final ObjectMapper mapper = new ObjectMapper(); + final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); + createOptionMap(configuration); + } + + public ArgumentApplicationParser(final OptionsParameter[] configuration) { + createOptionMap(configuration); + } + + private void createOptionMap(final OptionsParameter[] configuration) { + + Arrays + .stream(configuration) + .map( + conf -> { + final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); + o.setLongOpt(conf.getParamLongName()); + o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } + return o; + }) + .forEach(options::addOption); + + // HelpFormatter formatter = new HelpFormatter(); + // formatter.printHelp("myapp", null, options, null, true); + + } + + public static String decompressValue(final String abstractCompressed) { + try { + byte[] byteArray = org.apache.commons.codec.binary.Base64.decodeBase64(abstractCompressed.getBytes()); + GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); + final StringWriter stringWriter = new StringWriter(); + IOUtils.copy(gis, stringWriter); + return stringWriter.toString(); + } catch (Throwable e) { + System.out.println("Wrong value to decompress:" + abstractCompressed); + throw new RuntimeException(e); + } + } + + public static String compressArgument(final String value) throws Exception { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(value.getBytes()); + gzip.close(); + return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + } + + public void parseArgument(final String[] args) throws Exception { + CommandLineParser parser = new BasicParser(); + CommandLine cmd = parser.parse(options, args); + Arrays + .stream(cmd.getOptions()) + .forEach( + it -> objectMap + .put( + it.getLongOpt(), + compressedValues.contains(it.getLongOpt()) + ? 
decompressValue(it.getValue()) + : it.getValue())); + } + + public String get(final String key) { + return objectMap.get(key); + } + + public Map getObjectMap() { + return objectMap; + } +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/support/EdgeParam.java b/dhp-raid/src/main/java/eu/dnetlib/raid/support/EdgeParam.java new file mode 100644 index 0000000..7960c23 --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/support/EdgeParam.java @@ -0,0 +1,33 @@ +package eu.dnetlib.raid.support; +import java.io.Serializable; +import java.util.List; + +public class EdgeParam implements Serializable { + + private String name; + private List metapath; + + public EdgeParam() { + } + + public EdgeParam(String name, List metapath) { + this.name = name; + this.metapath = metapath; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getMetapath() { + return metapath; + } + + public void setMetapath(List metapath) { + this.metapath = metapath; + } +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/support/OptionsParameter.java b/dhp-raid/src/main/java/eu/dnetlib/raid/support/OptionsParameter.java new file mode 100644 index 0000000..c4c1bfa --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/support/OptionsParameter.java @@ -0,0 +1,39 @@ +package eu.dnetlib.raid.support; + +import java.io.Serializable; + +public class OptionsParameter implements Serializable { + + private String paramName; + private String paramLongName; + private String paramDescription; + private boolean paramRequired; + private boolean compressed; + + public OptionsParameter() { + } + + public String getParamName() { + return paramName; + } + + public String getParamLongName() { + return paramLongName; + } + + public String getParamDescription() { + return paramDescription; + } + + public boolean isParamRequired() { + return paramRequired; + } + + public boolean isCompressed() { + return compressed; + } + + public void setCompressed(boolean compressed) { + this.compressed = compressed; + } +} \ No newline at end of file diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/support/RAiDConfig.java b/dhp-raid/src/main/java/eu/dnetlib/raid/support/RAiDConfig.java new file mode 100644 index 0000000..370acda --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/support/RAiDConfig.java @@ -0,0 +1,75 @@ +package eu.dnetlib.raid.support; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.JsonIOException; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +public class RAiDConfig implements Serializable { + + private List nodes; // list of nodes to be considered + + private List edges; //list of edges to be created + + private Map randomWalks; //list of random walk parameters + + private Map params; + + public RAiDConfig() { + + } + + public RAiDConfig(List nodes, List edges, Map randomWalks, Map params) { + this.nodes = nodes; + this.edges = edges; + this.randomWalks = randomWalks; + this.params = params; + } + + public List getNodes() { + return nodes; + } + + public void setNodes(List nodes) { + this.nodes = nodes; + } + + public List getEdges() { + return edges; + } + + public void setEdges(List edges) { + this.edges = edges; + } + + public Map getRandomWalks() { + return randomWalks; + } + + public void setRandomWalks(Map randomWalks) { + this.randomWalks = randomWalks; + } + + public Map getParams() { + return params; + } + + public void 
setParams(Map params) { + this.params = params; + } + + public static RAiDConfig load(final String json) { + + final RAiDConfig config; + try { + config = new ObjectMapper().readValue(json, RAiDConfig.class); + return config; + } catch (IOException e) { + throw new JsonIOException("Error in parsing configuration json", e); + } + + } +} diff --git a/dhp-raid/src/main/java/eu/dnetlib/raid/support/RandomWalkParam.java b/dhp-raid/src/main/java/eu/dnetlib/raid/support/RandomWalkParam.java new file mode 100644 index 0000000..be74582 --- /dev/null +++ b/dhp-raid/src/main/java/eu/dnetlib/raid/support/RandomWalkParam.java @@ -0,0 +1,53 @@ +package eu.dnetlib.raid.support; + +import java.io.Serializable; + +public class RandomWalkParam implements Serializable { + + private double p; + private double q; + private int walkLength; + private int numWalks; + + public RandomWalkParam() { + } + + public RandomWalkParam(double p, double q, int walkLength, int numWalks) { + this.p = p; + this.q = q; + this.walkLength = walkLength; + this.numWalks = numWalks; + } + + public double getP() { + return p; + } + + public void setP(double p) { + this.p = p; + } + + public double getQ() { + return q; + } + + public void setQ(double q) { + this.q = q; + } + + public int getWalkLength() { + return walkLength; + } + + public void setWalkLength(int walkLength) { + this.walkLength = walkLength; + } + + public int getNumWalks() { + return numWalks; + } + + public void setNumWalks(int numWalks) { + this.numWalks = numWalks; + } +} diff --git a/dhp-raid/src/main/resources/jobs/parameters/createClusters_parameters.json b/dhp-raid/src/main/resources/jobs/parameters/createClusters_parameters.json new file mode 100644 index 0000000..b2693f0 --- /dev/null +++ b/dhp-raid/src/main/resources/jobs/parameters/createClusters_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "input embeddings", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "working directory for the computations", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "output path", + "paramRequired": true + }, + { + "paramName": "rc", + "paramLongName": "raidConfPath", + "paramDescription": "RAiD configuration path", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of partitions for the spark job", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-raid/src/main/resources/jobs/parameters/createEmbeddings_parameters.json b/dhp-raid/src/main/resources/jobs/parameters/createEmbeddings_parameters.json new file mode 100644 index 0000000..c329775 --- /dev/null +++ b/dhp-raid/src/main/resources/jobs/parameters/createEmbeddings_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "g", + "paramLongName": "graphBasePath", + "paramDescription": "the input graph base path", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "working directory for the computations", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "output path", + "paramRequired": true + }, + { + "paramName": "rc", + "paramLongName": "raidConfPath", + "paramDescription": "RAiD configuration path", + "paramRequired": true + }, + { + "paramName": "np", + "paramLongName": "numPartitions", + "paramDescription": "number of 
partitions for the spark job", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-raid/src/main/resources/raid/oozie_app/config-default.xml b/dhp-raid/src/main/resources/raid/oozie_app/config-default.xml new file mode 100644 index 0000000..2e0ed9a --- /dev/null +++ b/dhp-raid/src/main/resources/raid/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-raid/src/main/resources/raid/oozie_app/workflow.xml b/dhp-raid/src/main/resources/raid/oozie_app/workflow.xml new file mode 100644 index 0000000..ced46fd --- /dev/null +++ b/dhp-raid/src/main/resources/raid/oozie_app/workflow.xml @@ -0,0 +1,147 @@ + + + + graphBasePath + path for the input graph + + + raidConfPath + configuration for RAiD inference + + + workingPath + path of the working directory + + + numPartitions + number of partitions for the spark files + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + mapreduce.map.java.opts + -Xmx6144m + + + mapreduce.reduce.java.opts + -Xmx6144m + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Create Embeddings + eu.dnetlib.raid.jobs.SparkCreateEmbeddings + dhp-raid-${projectVersion}.jar + + --num-executors=32 + --executor-memory=16G + --executor-cores=4 + --driver-memory=8G + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.dynamicAllocation.enabled=true + + --graphBasePath${graphBasePath} + --workingPath${workingPath} + --outputPath${embeddingsPath} + --raidConfPath${raidConfPath} + --numPartitions${numPartitions} + + + + + + + + yarn + cluster + Create Clusters + eu.dnetlib.raid.jobs.SparkRAiDClustering + dhp-raid-${projectVersion}.jar + + --num-executors=32 + --executor-memory=16G + --executor-cores=4 + --driver-memory=8G + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.dynamicAllocation.enabled=true + + --inputPath${embeddingsPath} + --workingPath${workingPath} + --outputPath${clustersPath} + --raidConfPath${raidConfPath} + --numPartitions${numPartitions} + + + + + + 
+ \ No newline at end of file diff --git a/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/AliasOps.scala b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/AliasOps.scala new file mode 100644 index 0000000..1a45c1a --- /dev/null +++ b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/AliasOps.scala @@ -0,0 +1,75 @@ +package eu.dnetlib.raid.graph + +import scala.collection.mutable.ArrayBuffer + +/** + * alias method sampling + */ +object AliasOps { + + + def normalize(probabilities: Array[Double]): Array[Double] = { + val sum = probabilities.sum + probabilities.map(x => x / sum) + } + + def setupAlias(probabilities: Array[Double]): (Array[Int], Array[Double]) = { + + val K = probabilities.length + val J = Array.fill(K)(0) + val q = Array.fill(K)(0.0) + + val smaller = new ArrayBuffer[Int]() + val larger = new ArrayBuffer[Int]() + + probabilities.zipWithIndex.foreach { case (prob, kk) => + q(kk) = K * prob + if (q(kk) < 1.0) { + smaller.append(kk) + } else { + larger.append(kk) + } + } + + while (smaller.nonEmpty && larger.nonEmpty) { + val small = smaller.remove(smaller.length - 1) + val large = larger.remove(larger.length - 1) + + J(small) = large + q(large) = q(large) + q(small) - 1.0 + if (q(large) < 1.0) { + smaller.append(large) + } else { + larger.append(large) + } + } + (J, q) + } + + + def drawAlias(J: Array[Int], q: Array[Double]): Int = { + val K = J.length + val kk = math.floor(math.random * K).toInt + if (math.random < q(kk)) kk + else J(kk) + } + + + def main(args: Array[String]): Unit = { + + val probabilities = Array(0.28, 0.32, 0.1, 0.3) + + val jq = setupAlias(probabilities) + val res = new ArrayBuffer[Int]() + for (_ <- 0 until 1000) { + val index = drawAlias(jq._1, jq._2) + res.append(index) + } + + for (i <- probabilities.indices) { + println(i + ": " + res.count(x => x == i)) + } + + } + +} diff --git a/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphOps.scala b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphOps.scala new file mode 100644 index 0000000..d4dc444 --- /dev/null +++ b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphOps.scala @@ -0,0 +1,67 @@ +package eu.dnetlib.raid.graph + +import org.apache.spark.graphx._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession + +/** + * initial graph with node alias and edge alias + */ +object GraphOps { + + def setupEdgeAlias(p: Double = 1.0, q: Double = 1.0)(srcId: Long, + srcNeighbors: Array[(Long, Double)], + dstNeighbors: Array[(Long, Double)]): (Array[Int], Array[Double]) = { + + val unNormalizedProbabilities = dstNeighbors.map { case (dstNeighborId, weight) => + var unNormProb = weight / q + if (srcId == dstNeighborId) unNormProb = weight / p + else if (srcNeighbors.exists(_._1 == dstNeighborId)) unNormProb = weight + unNormProb + } + AliasOps.setupAlias(AliasOps.normalize(unNormalizedProbabilities)) + } + + + def setupNodeAlias(neighbors: Array[(Long, Double)]): (Array[Int], Array[Double]) = { + val unNormalizedProbabilities = neighbors.map(_._2) + AliasOps.setupAlias(AliasOps.normalize(unNormalizedProbabilities)) + } + + + def initTransitionProb(spark: SparkSession, + edges: RDD[Edge[EdgeAttr]], + vertices: RDD[(VertexId, NodeAttr)], + p: Double, + q: Double): Graph[NodeAttr, EdgeAttr] = { + + val bcP = spark.sparkContext.broadcast(p) + val bcQ = spark.sparkContext.broadcast(q) + + val graph = Graph.apply(vertices, edges).mapVertices[NodeAttr] { + case (_, nodeAttr) => + val (j, q) = GraphOps.setupNodeAlias(nodeAttr.neighbors) + nodeAttr.J = j + nodeAttr.q = q + nodeAttr + }.mapTriplets { + 
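+      // node2vec-style second-order transition probabilities: for each triplet, the edge alias tables
+      // weight a return to the source node by 1/p, a move within the source's neighbourhood by the raw
+      // edge weight, and a move further away by 1/q (see setupEdgeAlias above)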
+ edgeTriplet: EdgeTriplet[NodeAttr, EdgeAttr] => + + val (j, q) = GraphOps.setupEdgeAlias(bcP.value, bcQ.value)( + edgeTriplet.srcId, + edgeTriplet.srcAttr.neighbors, + edgeTriplet.dstAttr.neighbors) + + edgeTriplet.attr.J = j + edgeTriplet.attr.q = q + + edgeTriplet.attr.dstNeighbors = edgeTriplet.dstAttr.neighbors.map(_._1) + + edgeTriplet.attr + } + + graph + } + +} diff --git a/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphUtil.scala b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphUtil.scala new file mode 100644 index 0000000..3bc043c --- /dev/null +++ b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/GraphUtil.scala @@ -0,0 +1,53 @@ +package eu.dnetlib.raid.graph + +import eu.dnetlib.raid.jobs.AbstractSparkJob +import org.apache.spark.api.java.JavaRDD +import org.apache.spark.api.java.JavaRDD.fromRDD +import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Encoders, Row, RowFactory, SparkSession} +import org.apache.spark.sql.functions.{col, sum} +import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} + +object GraphUtil { + + // process the graph to make sure it can be fit into randomWalk + def preProcess(dataFrame: DataFrame, srcCol: String = "src", dstCol: String = "dst", weightCol: String = "weight"): DataFrame = { + + val spark = dataFrame.sparkSession + import spark.implicits._ + + val filteredDF = dataFrame.filter(col(srcCol) =!= col(dstCol)) + + val uniqueEdges = filteredDF.map(x => { + val src = x.getAs[Long](srcCol) + val dst = x.getAs[Long](dstCol) + val weight = x.getAs[Double](weightCol) + val srcDst = Array(src, dst).sorted.mkString(",") + (srcDst, weight) + }).toDF("edge", "weight") + + val weightedEdges = uniqueEdges.groupBy("edge").agg( + sum("weight") + ).flatMap(x => { + val edge = x.getString(0).split(",").map(_.toLong) + val weight = x.getDouble(1) + Array((edge(0), edge(1), weight), (edge(1), edge(0), weight)) + }).toDF(srcCol, dstCol, weightCol) + + weightedEdges + + } + + def createConnectedComponents(spark: SparkSession, vertices: RDD[(Long, String)], edges: RDD[Edge[String]], maxIter: Int): DataFrame = { + + val graph = Graph.apply(vertices,edges,"unknown") + val cc : RDD[Row] = graph.connectedComponents(maxIter).vertices.map(tuple => Row(tuple._1, tuple._2)).rdd + + spark.createDataFrame(cc.rdd, StructType(Seq( + StructField("longId", LongType, nullable = false), + StructField("labelId", LongType, nullable = false) + ))) + } +} + diff --git a/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/package.scala b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/package.scala new file mode 100644 index 0000000..32776b3 --- /dev/null +++ b/dhp-raid/src/main/scala/eu/dnetlib/raid/graph/package.scala @@ -0,0 +1,13 @@ +package eu.dnetlib.raid + +package object graph { + + case class NodeAttr(var neighbors: Array[(Long, Double)] = Array.empty[(Long, Double)], + var J: Array[Int] = Array.empty[Int], + var q: Array[Double] = Array.empty[Double]) extends Serializable + + case class EdgeAttr(var dstNeighbors: Array[Long] = Array.empty[Long], + var J: Array[Int] = Array.empty[Int], + var q: Array[Double] = Array.empty[Double]) extends Serializable + +} diff --git a/dhp-raid/src/main/scala/eu/dnetlib/raid/walker/RandomWalk.scala b/dhp-raid/src/main/scala/eu/dnetlib/raid/walker/RandomWalk.scala new file mode 100644 index 0000000..2830110 --- /dev/null +++ b/dhp-raid/src/main/scala/eu/dnetlib/raid/walker/RandomWalk.scala @@ -0,0 +1,170 @@ +package 
eu.dnetlib.raid.walker + +import eu.dnetlib.raid.graph.{AliasOps, EdgeAttr, GraphOps, NodeAttr} +import org.apache.spark.graphx.{Edge, Graph} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.collect_list + +import scala.collection.mutable.ArrayBuffer + + +class RandomWalk extends Serializable { + + var p: Double = 1.0 + var q: Double = 1.0 + var numWalk: Int = 10 + var walkLength: Int = 10 + var bcMaxDegree: Int = 30 + var weightLog: Boolean = false + + var srcCol: String = "src" + var dstCol: String = "dst" + var weightCol: String = "weight" + var outputCol: String = "sequence" + + + /** + * initialize the graph + * + * @param dataFrame input dataFrame + */ + def initGraph(dataFrame: DataFrame): Graph[NodeAttr, EdgeAttr] = { + val spark = dataFrame.sparkSession + import spark.implicits._ + + val edges = dataFrame.map(x => { + val src = x.getAs[Long](srcCol) + val dst = x.getAs[Long](dstCol) + Edge(src, dst, EdgeAttr()) + }) + + val vertices = dataFrame.groupBy(srcCol).agg( + collect_list(dstCol), + collect_list(weightCol) + ).map(x => { + val src = x.getLong(0) + val dstSeq = x.getSeq[Long](1) + var weightSeq = x.getSeq[Double](2) + if (weightLog) weightSeq = weightSeq.map(x => math.log1p(x)) + val dstWeight = dstSeq.zip(weightSeq).sortBy(_._2).reverse.take(bcMaxDegree) + (src, NodeAttr(neighbors = dstWeight.map(x => (x._1, x._2)).toArray)) + }) + + GraphOps.initTransitionProb(spark, edges.rdd, vertices.rdd, p, q) + + } + + def setWeightLog(weightLog: Boolean): this.type = { + this.weightLog = weightLog + this + } + + def setBcMaxDegree(bcMaxDegree: Int): this.type = { + this.bcMaxDegree = bcMaxDegree + this + } + + def setP(p: Double): this.type = { + this.p = p + this + } + + def setQ(p: Double): this.type = { + this.q = q + this + } + + def setNumWalk(numWalk: Int): this.type = { + this.numWalk = numWalk + this + } + + def setWalkLength(walkLength: Int): this.type = { + this.walkLength = walkLength + this + } + + def setSrcCol(srcCol: String): this.type = { + this.srcCol = srcCol + this + } + + def setDstCol(dstCol: String): this.type = { + this.dstCol = dstCol + this + } + + def setWeightCol(weightCol: String): this.type = { + this.weightCol = weightCol + this + } + + def setOutputCol(outputCol: String): this.type = { + this.outputCol = outputCol + this + } + + def randomWalk(dataFrame: DataFrame): DataFrame = { + + val spark = dataFrame.sparkSession + import spark.implicits._ + + val graph = initGraph(dataFrame) + + val edges = graph.edges.map { x => + (x.srcId + "," + x.dstId, x.attr) + }.repartition(1000).cache() + edges.first() + + val vertices = graph.vertices.cache() + vertices.first() + + var result: RDD[(String, ArrayBuffer[Long])] = null + + for (_ <- 0 until numWalk) { + + var path: RDD[(String, ArrayBuffer[Long])] = firstWalk(vertices) + path.first() + + for (_ <- 1 until walkLength) { + path = edges.join(path).map { x => + val edgeAttr = x._2._1 + val pathBuffer = x._2._2 + val nextNodeIndex = AliasOps.drawAlias(edgeAttr.J, edgeAttr.q) + val nextNode = edgeAttr.dstNeighbors(nextNodeIndex) + pathBuffer.append(nextNode) + val currId = pathBuffer.last + val prevId = pathBuffer(pathBuffer.length - 2) + (prevId + "," + currId, pathBuffer) + } + path.first() + } + + if (result == null) { + result = path + result.first() + } else { + result = result.union(path) + result.first() + } + + } + result.map { x => x._2.map(_.toString).toArray }.toDF(outputCol) + } + + def firstWalk(vertices: RDD[(Long, NodeAttr)]): 
RDD[(String, ArrayBuffer[Long])] = { + val result = vertices.map { x => + val nodeAttr = x._2 + val startNode = x._1 + val nextNodeIndex = AliasOps.drawAlias(nodeAttr.J, nodeAttr.q) + val nextNode = nodeAttr.neighbors(nextNodeIndex)._1 + val buffer = new ArrayBuffer[Long]() + buffer.append(startNode) + buffer.append(nextNode) + (startNode + "," + nextNode, buffer) + } + result + } + +} diff --git a/dhp-raid/src/test/java/eu/dnetlib/raid/RAiDInferenceTest.java b/dhp-raid/src/test/java/eu/dnetlib/raid/RAiDInferenceTest.java new file mode 100644 index 0000000..4963d0c --- /dev/null +++ b/dhp-raid/src/test/java/eu/dnetlib/raid/RAiDInferenceTest.java @@ -0,0 +1,141 @@ +package eu.dnetlib.raid; + +import eu.dnetlib.raid.jobs.AbstractSparkJob; +import eu.dnetlib.raid.jobs.SparkCreateEmbeddings; +import eu.dnetlib.raid.jobs.SparkRAiDClustering; +import eu.dnetlib.raid.support.ArgumentApplicationParser; +import eu.dnetlib.raid.support.RAiDConfig; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.*; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URISyntaxException; +import java.nio.file.Paths; +import java.util.stream.Collectors; + +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +public class RAiDInferenceTest { + + static SparkSession spark; + static RAiDConfig config; + static JavaSparkContext context; + + final String graphBasePath = Paths + .get(RAiDInferenceTest.class.getResource("/eu/dnetlib/raid/examples/graph").toURI()) + .toFile() + .getAbsolutePath(); + + final static String workingPath = "/tmp/working_dir"; + final static String embeddingsPath = "/tmp/embeddings"; + final static String clustersPath = "/tmp/clusters"; + + final static String numPartitions = "3"; + + final String raidConfPath = Paths + .get(RAiDInferenceTest.class.getResource("/eu/dnetlib/raid/config/raid.conf.json").toURI()) + .toFile() + .getAbsolutePath(); + + public RAiDInferenceTest() throws URISyntaxException { + } + + public static void cleanup() throws IOException { + //remove directories to clean workspace + FileUtils.deleteDirectory(new File(workingPath)); + FileUtils.deleteDirectory(new File(clustersPath)); + FileUtils.deleteDirectory(new File(embeddingsPath)); + } + + @BeforeAll + public void setup() throws IOException { + + cleanup(); + + config = RAiDConfig.load(readFileFromHDFS(raidConfPath)); + + spark = SparkSession + .builder() + .appName("RAiD inference") + .master("local[*]") + .getOrCreate(); + context = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + } + + @AfterAll + public static void finalCleanUp() throws IOException { + cleanup(); + } + + protected static String readFileFromHDFS(String filePath) throws IOException { + + Path path=new Path(filePath); + FileSystem fs = FileSystem.get(new Configuration()); + BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path))); + try { + return String.join("", br.lines().collect(Collectors.toList())); + } finally { + br.close(); + } + } + + public static String readResource(String path, Class clazz) throws IOException { + return IOUtils.toString( + clazz.getResourceAsStream(path)); + } + + @Test + @Order(1) + public 
void createRAiDEmbeddingsTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createEmbeddings_parameters.json", SparkCreateEmbeddings.class)); + + parser.parseArgument( + new String[] { + "-g", graphBasePath, + "-rc", raidConfPath, + "-o", embeddingsPath, + "-w", workingPath, + "-np", numPartitions + }); + + new SparkCreateEmbeddings( + parser, + spark + ).run(); + + } + + @Test + @Order(2) + public void createClusters() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser(readResource("/jobs/parameters/createClusters_parameters.json", SparkRAiDClustering.class)); + + parser.parseArgument( + new String[] { + "-i", embeddingsPath, + "-rc", raidConfPath, + "-o", clustersPath, + "-w", workingPath, + "-np", numPartitions + } + ); + + new SparkRAiDClustering( + parser, + spark + ).run(); + } + +} diff --git a/dhp-raid/src/test/resources/eu/dnetlib/raid/config/raid.conf.json b/dhp-raid/src/test/resources/eu/dnetlib/raid/config/raid.conf.json new file mode 100644 index 0000000..b8dfeb0 --- /dev/null +++ b/dhp-raid/src/test/resources/eu/dnetlib/raid/config/raid.conf.json @@ -0,0 +1,40 @@ +{ + "nodes": ["publication", "software", "dataset"], + "edges": [ + { + "name": "coproduced", + "metapath": ["isProducedBy", "produces"] + }, + { + "name": "coinstitution", + "metapath": ["hasAuthorInstitutionOf", "isAuthorInstitutionOf"] + }, + { + "name": "part", + "metapath": ["IsPartOf"] + }, + { + "name": "version", + "metapath": ["IsVersionOf"] + }, + { + "name": "supplement", + "metapath": ["IsSupplementTo"] + } + ], + "randomWalks": { + "coproduced": {"p": 1.0, "q": 2.0, "walkLength": 3, "numWalks": 10}, + "coinstitution": {"p": 1.0, "q": 2.0, "walkLength": 3, "numWalks": 1}, + "part": {"p": 1.0, "q": 0.25, "walkLength": 3, "numWalks": 2}, + "version": {"p": 1.0, "q": 0.25, "walkLength": 4, "numWalks": 5}, + "supplement": {"p": 1.0, "q": 0.25, "walkLength": 3, "numWalks": 3} + }, + "params": { + "embeddingSize": 128, + "maxIter": 20, + "partitioningMaxIter": 50, + "learningRate": 0.01, + "epsilon": 0.5, + "minPts": 5 + } +} \ No newline at end of file diff --git a/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/dataset b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/dataset new file mode 100644 index 0000000..7f2f66a --- /dev/null +++ b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/dataset @@ -0,0 +1,3 @@ +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { "classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "value": "Description of dataset 1" }],"id": "50|dataset::1","pid": [],"subject": [],"title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { "classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title" }, "value": "Dataset 1" }]} +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { 
"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "value": "Description of dataset 2" }],"id": "50|dataset::2","pid": [],"subject": [],"title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { "classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title" }, "value": "Dataset 2" }]} +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { "classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "value": "Description of dataset 3" }],"id": "50|dataset::3","pid": [],"subject": [],"title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": { "classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions" }, "trust": "0.9" }, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title" }, "value": "Dataset 3" }]} \ No newline at end of file diff --git a/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/publication b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/publication new file mode 100644 index 0000000..19e2707 --- /dev/null +++ b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/publication @@ -0,0 +1,4 @@ +{"dataInfo": {"deletedbyinference": false}, "author": [], "description": [{"dataInfo": {"deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9" }, "value": "Description of publication 1"}], "id": "50|publication::1", "pid": [], "title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9"}, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"},"value": "Publication 1"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [], "description": [{"dataInfo": {"deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9" }, "value": "Description of publication 2"}], "id": "50|publication::2", "pid": [], "title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9"}, "qualifier": { "classid": "main 
title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"},"value": "Publication 2"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [], "description": [{"dataInfo": {"deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9" }, "value": "Description of publication 3"}], "id": "50|publication::3", "pid": [], "title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9"}, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"},"value": "Publication 3"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [], "description": [{"dataInfo": {"deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9" }, "value": "Description of publication 4"}], "id": "50|publication::4", "pid": [], "title": [ { "dataInfo": { "deletedbyinference": false, "inferred": false, "invisible": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "trust": "0.9"}, "qualifier": { "classid": "main title", "classname": "main title", "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title"},"value": "Publication 4"}]} \ No newline at end of file diff --git a/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/relation b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/relation new file mode 100644 index 0000000..5c5fddc --- /dev/null +++ b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/relation @@ -0,0 +1,29 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|publication::1","subRelType":"provision","target":"40|project::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::1","subRelType":"provision","target":"50|publication::1","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|software::1","subRelType":"provision","target":"40|project::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::1","subRelType":"provision","target":"50|software::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|dataset::1","subRelType":"provision","target":"40|project::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::1","subRelType":"provision","target":"50|dataset::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|publication::2","subRelType":"provision","target":"40|project::2","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::2","subRelType":"provision","target":"50|publication::2","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|software::3","subRelType":"provision","target":"40|project::2","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::2","subRelType":"provision","target":"50|software::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|publication::3","subRelType":"provision","target":"40|project::2","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::2","subRelType":"provision","target":"50|publication::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|publication::4","subRelType":"provision","target":"40|project::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::3","subRelType":"provision","target":"50|publication::4","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|software::4","subRelType":"provision","target":"40|project::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::3","subRelType":"provision","target":"50|software::4","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|dataset::3","subRelType":"provision","target":"40|project::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::3","subRelType":"provision","target":"50|dataset::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isProducedBy","relType":"resultProject","source":"50|dataset::2","subRelType":"provision","target":"40|project::2","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"produces","relType":"projectResult","source":"40|project::2","subRelType":"provision","target":"50|dataset::2","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"IsVersionOf","relType":"resultResult","source":"50|software::2","subRelType":"provision","target":"50|software::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"IsPartOf","relType":"resultResult","source":"50|publication::2","subRelType":"provision","target":"50|publication::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"IsSupplementTo","relType":"resultResult","source":"50|dataset::1","subRelType":"provision","target":"50|publication::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"IsSupplementTo","relType":"resultResult","source":"50|dataset::2","subRelType":"provision","target":"50|publication::3","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"IsSupplementTo","relType":"resultResult","source":"50|dataset::3","subRelType":"provision","target":"50|publication::4","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"hasAuthorInstitutionOf","relType":"resultOrganization","source":"50|publication::2","subRelType":"provision","target":"20|organization::1","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"organizationResult","source":"20|organization::1","subRelType":"provision","target":"50|publication::2","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"hasAuthorInstitutionOf","relType":"resultOrganization","source":"50|dataset::3","subRelType":"provision","target":"20|organization::1","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1727879293497,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"organizationResult","source":"20|organization::1","subRelType":"provision","target":"50|dataset::3","validated":false} \ No newline at end of file diff --git a/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/software b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/software new file mode 100644 index 0000000..a7c4f38 --- /dev/null +++ b/dhp-raid/src/test/resources/eu/dnetlib/raid/examples/graph/software @@ -0,0 +1,4 @@ +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"value": "Description of software 1"}],"id": "50|software::1","originalId": ["amphoranet","50|__bioTools__::393e61aebe3ef9bc8701c8dc843e08f2"],"pid": [],"source": [],"subject": [],"title": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"},"value": "Software 1"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"value": "Description of software 2"}],"id": "50|software::2","originalId": 
["amphoranet","50|__bioTools__::393e61aebe3ef9bc8701c8dc843e08f2"],"pid": [],"source": [],"subject": [],"title": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"},"value": "Software 2"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"value": "Description of software 3"}],"id": "50|software::3","originalId": ["amphoranet","50|__bioTools__::393e61aebe3ef9bc8701c8dc843e08f2"],"pid": [],"source": [],"subject": [],"title": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"},"value": "Software 3"}]} +{"dataInfo": {"deletedbyinference": false}, "author": [],"description": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"value": "Description of software 4"}],"id": "50|software::4","originalId": ["amphoranet","50|__bioTools__::393e61aebe3ef9bc8701c8dc843e08f2"],"pid": [],"source": [],"subject": [],"title": [{"dataInfo": {"deletedbyinference": false,"inferenceprovenance": "","inferred": false,"invisible": false,"provenanceaction": {"classid": "sysimport:crosswalk:datasetarchive","classname": "Harvested","schemeid": "dnet:provenanceActions","schemename": "dnet:provenanceActions"},"trust": "0.9"},"qualifier": {"classid": "main title","classname": "main title","schemeid": "dnet:dataCite_title","schemename": "dnet:dataCite_title"},"value": "Software 4"}]} \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..fbb588a --- /dev/null +++ b/pom.xml @@ -0,0 +1,445 @@ + + + + 4.0.0 + + eu.dnetlib + raid-inference + 1.0.0-SNAPSHOT + + pom + + http://www.d-net.research-infrastructures.eu + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + + + scm:git:https://code-repo.d4science.org/D-Net/raid-inference.git + raid-inference-1.0.0 + + + + dhp-raid + dhp-build + + + + Redmine + https://issue.openaire.research-infrastructures.eu/projects/openaire + + + + + dnet45-releases + D-Net 45 Releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + + + + + + dnet45-releases + D-Net 45 releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + default + + false + + + true + + + + + dnet45-snapshots + D-Net 45 snapshots + 
https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots + default + + true + + + false + + + + dnet-deps + dnet-dependencies + https://maven.d4science.org/nexus/content/repositories/dnet-deps + default + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + + ceon + Ceon Repository + https://maven.ceon.pl/artifactory/repo + + true + + + false + + + + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.0 + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + + + + false + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + + + + org.apache.maven.plugins + maven-failsafe-plugin + 2.13 + + + integration-test + + integration-test + + + + verify + + verify + + + + + + + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + + + + + + iis-releases + iis releases plugin repository + http://maven.ceon.pl/artifactory/iis-releases + default + + + + + + oozie-package + src/test/resources/define/path/pointing/to/directory/holding/oozie_app + oozie_app + + UTF-8 + UTF-8 + + 2.2.2 + 15.0 + + 2.2.0 + + 2.6.5 + 3.3.3 + [8.0.1] + + 3.5 + 2.4 + 3.2.1 + 1.1.3 + + 4.9 + 2.11.8 + + false + 3.6.0 + + + default + default + default + primed + + runtime + + true + + ${user.home}/.dhp/application.properties + + ${maven.build.timestamp} + + ${project.version} + true + 2.0.1 + 5.6.1 + ../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar + + + + + + + edu.cmu + secondstring + 1.0.0 + + + org.antlr + stringtemplate + 3.2 + + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + provided + + + com.fasterxml.jackson.dataformat + jackson-dataformat-xml + ${jackson.version} + provided + + + com.fasterxml.jackson.module + jackson-module-jsonSchema + ${jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + provided + + + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + + + + org.apache.commons + commons-math3 + 3.6.1 + + + + com.google.guava + guava + ${google.guava.version} + + + com.google.code.gson + gson + ${google.gson.version} + + + + org.apache.commons + commons-lang3 + ${commons.lang.version} + + + + commons-io + commons-io + ${commons.io.version} + + + commons-collections + commons-collections + ${commons.collections.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + org.apache.spark + spark-core_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-graphx_2.11 + ${spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${spark.version} + provided + + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + + + + org.scala-lang + scala-library + ${scala.version} + + + + org.apache.oozie + oozie-client + 5.1.0 + + + com.jayway.jsonpath + json-path + 
2.4.0 + + + + com.ibm.icu + icu4j + 70.1 + + + + eu.dnetlib.dhp + dhp-schemas + ${dhp-schemas.version} + + + + de.lmu.ifi.dbs.elki + elki + 0.7.5 + + + + com.github.haifengl + smile-core + 2.5.3 + + + + + + + + +
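
Review note, not part of the applied patch: the sketch below shows one way the RandomWalk builder added above could be exercised on a toy edge list. It is a minimal, untested illustration; the local[*] session, the three-edge DataFrame and the chosen p/q values are assumptions for demonstration only, and the column names src, dst and weight simply match the class defaults.

import eu.dnetlib.raid.walker.RandomWalk
import org.apache.spark.sql.SparkSession

object RandomWalkSketch {

  def main(args: Array[String]): Unit = {
    // local session for experimentation only; the oozie workflow runs the real jobs on the cluster
    val spark = SparkSession.builder()
      .appName("random-walk-sketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // toy weighted edge list: node ids must be Long because initGraph reads them with getAs[Long]
    val edges = Seq(
      (1L, 2L, 1.0),
      (2L, 3L, 2.0),
      (3L, 1L, 0.5)
    ).toDF("src", "dst", "weight")

    val walks = new RandomWalk()
      .setP(1.0)
      .setQ(0.25)         // return/in-out parameters; the test raid.conf.json uses q values of 2.0 and 0.25
      .setNumWalk(5)
      .setWalkLength(3)
      .setWeightLog(true) // apply log1p damping to edge weights (see initGraph)
      .randomWalk(edges)

    walks.show(false)     // one array of visited node ids (as strings) per generated walk, in the "sequence" column
    spark.stop()
  }
}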