diff --git a/.gitignore b/.gitignore
index 28ec2ec194..a208e171fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
*.iws
*.ipr
*.iml
+*.ipr
+*.iws
*~
.vscode
.classpath
@@ -21,5 +23,5 @@
/*/build
/build
spark-warehouse
-/*/*/job-override.properties
+/**/job-override.properties
diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml
index 834af77fa9..0c4637def4 100644
--- a/dhp-build/dhp-build-assembly-resources/pom.xml
+++ b/dhp-build/dhp-build-assembly-resources/pom.xml
@@ -6,7 +6,7 @@
eu.dnetlib.dhp
dhp-build
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
dhp-build-assembly-resources
diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
index 4f99d5298b..308d787157 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml
+++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml
@@ -6,7 +6,7 @@
eu.dnetlib.dhp
dhp-build
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
dhp-build-properties-maven-plugin
@@ -76,6 +76,41 @@
+
+
+
+
+ org.eclipse.m2e
+ lifecycle-mapping
+ 1.0.0
+
+
+
+
+
+
+ org.apache.maven.plugins
+
+
+ maven-plugin-plugin
+
+
+ [3.2,)
+
+
+ descriptor
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java
index 6f55828ef4..a2cb8e0f1e 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java
@@ -1,22 +1,21 @@
package eu.dnetlib.maven.plugin.properties;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME;
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-import org.junit.Before;
-import org.junit.Test;
+import static org.junit.jupiter.api.Assertions.*;
/**
- * @author mhorst
+ * @author mhorst, claudio.atzori
*
*/
public class GenerateOoziePropertiesMojoTest {
private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
- @Before
+ @BeforeEach
public void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
@@ -28,7 +27,7 @@ public class GenerateOoziePropertiesMojoTest {
mojo.execute();
// assert
- assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
+ assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test
diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
index 51d9575ffd..4b72130787 100644
--- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
+++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java
@@ -1,51 +1,41 @@
package eu.dnetlib.maven.plugin.properties;
-import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.doReturn;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.Properties;
-
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.project.MavenProject;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import org.junit.runner.RunWith;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
-import org.mockito.runners.MockitoJUnitRunner;
+import org.mockito.MockitoAnnotations;
+import org.mockito.junit.jupiter.MockitoExtension;
+import java.io.*;
+import java.util.Properties;
+
+import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
+import static org.junit.jupiter.api.Assertions.*;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.lenient;
/**
- * @author mhorst
+ * @author mhorst, claudio.atzori
*
*/
-@RunWith(MockitoJUnitRunner.class)
+@ExtendWith(MockitoExtension.class)
public class WritePredefinedProjectPropertiesTest {
- @Rule
- public TemporaryFolder testFolder = new TemporaryFolder();
-
@Mock
private MavenProject mavenProject;
private WritePredefinedProjectProperties mojo;
- @Before
- public void init() {
+ @BeforeEach
+ public void init(@TempDir File testFolder) {
+ MockitoAnnotations.initMocks(this);
mojo = new WritePredefinedProjectProperties();
- mojo.outputFile = getPropertiesFileLocation();
+ mojo.outputFile = getPropertiesFileLocation(testFolder);
mojo.project = mavenProject;
- doReturn(new Properties()).when(mavenProject).getProperties();
+ lenient().doReturn(new Properties()).when(mavenProject).getProperties();
}
// ----------------------------------- TESTS ---------------------------------------------
@@ -57,7 +47,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(0, storedProperties.size());
}
@@ -75,28 +65,28 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
- @Test(expected=MojoExecutionException.class)
- public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception {
+ @Test
+ public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
Properties projectProperties = new Properties();
projectProperties.setProperty(key, value);
doReturn(projectProperties).when(mavenProject).getProperties();
- mojo.outputFile = testFolder.getRoot();
+ mojo.outputFile = testFolder;
// execute
- mojo.execute();
+ Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
- public void testExecuteWithProjectPropertiesExclusion() throws Exception {
+ public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -113,14 +103,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
@Test
- public void testExecuteWithProjectPropertiesInclusion() throws Exception {
+ public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -137,14 +127,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test
- public void testExecuteIncludingPropertyKeysFromFile() throws Exception {
+ public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -155,7 +145,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
- File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties");
+ File includedPropertiesFile = new File(testFolder, "included.properties");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileWriter(includedPropertiesFile), null);
@@ -167,14 +157,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test
- public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception {
+ public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -192,14 +182,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
- @Test(expected=MojoExecutionException.class)
- public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception {
+ @Test
+ public void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -213,11 +203,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {""});
// execute
- mojo.execute();
+ Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
- public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception {
+ public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -228,7 +218,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
- File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
+ File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null);
@@ -240,14 +230,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
- @Test(expected=MojoExecutionException.class)
- public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception {
+ @Test
+ public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@@ -258,7 +248,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
- File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
+ File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileOutputStream(includedPropertiesFile), null);
@@ -266,11 +256,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
// execute
- mojo.execute();
+ Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
- public void testExecuteWithQuietModeOn() throws Exception {
+ public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given
mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
@@ -280,21 +270,21 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertEquals(0, storedProperties.size());
}
- @Test(expected=MojoExecutionException.class)
- public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception {
+ @Test
+ public void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
// execute
- mojo.execute();
+ Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
- public void testExecuteWithEnvironmentProperties() throws Exception {
+ public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given
mojo.setIncludeEnvironmentVariables(true);
@@ -303,7 +293,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
for (Object currentKey : storedProperties.keySet()) {
assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV));
@@ -311,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
- public void testExecuteWithSystemProperties() throws Exception {
+ public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey";
String value = "systemPropertyValue";
@@ -323,14 +313,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
@Test
- public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception {
+ public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey ";
String value = "systemPropertyValue";
@@ -344,7 +334,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
- Properties storedProperties = getStoredProperties();
+ Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
assertFalse(storedProperties.containsKey(key));
assertTrue(storedProperties.containsKey(key.trim()));
@@ -353,13 +343,13 @@ public class WritePredefinedProjectPropertiesTest {
// ----------------------------------- PRIVATE -------------------------------------------
- private File getPropertiesFileLocation() {
- return new File(testFolder.getRoot(), "test.properties");
+ private File getPropertiesFileLocation(File testFolder) {
+ return new File(testFolder, "test.properties");
}
- private Properties getStoredProperties() throws FileNotFoundException, IOException {
+ private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException {
Properties properties = new Properties();
- properties.load(new FileInputStream(getPropertiesFileLocation()));
+ properties.load(new FileInputStream(getPropertiesFileLocation(testFolder)));
return properties;
}
}
diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml
index e0b20204c9..e471af76d5 100644
--- a/dhp-build/pom.xml
+++ b/dhp-build/pom.xml
@@ -4,7 +4,7 @@
eu.dnetlib.dhp
dhp
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
dhp-build
pom
diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 345a5475fa..f6283d450c 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -5,7 +5,7 @@
eu.dnetlib.dhp
dhp
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
../
@@ -42,6 +42,23 @@
com.rabbitmq
amqp-client
+
+ net.sf.saxon
+ Saxon-HE
+
+
+ org.slf4j
+ jcl-over-slf4j
+
+
+ org.apache.cxf
+ cxf-rt-transports-http
+
+
+ eu.dnetlib
+ cnr-rmi-api
+
+
com.ximpleware
vtd-xml
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java
new file mode 100644
index 0000000000..c74cf3c111
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java
@@ -0,0 +1,24 @@
+package eu.dnetlib.dhp.utils;
+
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
+
+public class ISLookupClientFactory {
+
+ private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
+
+ public static ISLookUpService getLookUpService(final String isLookupUrl) {
+ return getServiceStub(ISLookUpService.class, isLookupUrl);
+ }
+
+ @SuppressWarnings("unchecked")
+ private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
+ log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
+ final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
+ jaxWsProxyFactory.setServiceClass(clazz);
+ jaxWsProxyFactory.setAddress(endpoint);
+ return (T) jaxWsProxyFactory.create();
+ }
+}
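A brief usage sketch for the factory above. The endpoint URL is a placeholder, and the quickSearchProfile(...) call is an assumption about the ISLookUpService interface, which is not shown in this patch:

import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class ISLookupClientExample {

    public static void main(String[] args) throws Exception {
        // placeholder endpoint; the workflows receive it as the isLookupUrl parameter
        final String isLookupUrl = "http://localhost:8280/is/services/isLookUp";

        // obtain a JAX-WS (CXF) client proxy bound to the remote IS LookUp service
        final ISLookUpService lookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);

        // quickSearchProfile(xquery) is assumed here as the typical profile lookup call
        for (String profile : lookUp.quickSearchProfile("//RESOURCE_PROFILE")) {
            System.out.println(profile);
        }
    }
}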
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java
new file mode 100644
index 0000000000..bd39624404
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java
@@ -0,0 +1,32 @@
+package eu.dnetlib.dhp.utils.saxon;
+
+import net.sf.saxon.expr.XPathContext;
+import net.sf.saxon.lib.ExtensionFunctionCall;
+import net.sf.saxon.lib.ExtensionFunctionDefinition;
+import net.sf.saxon.om.Sequence;
+import net.sf.saxon.om.StructuredQName;
+import net.sf.saxon.trans.XPathException;
+
+public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
+
+ public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
+
+ public abstract String getName();
+ public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
+
+ @Override
+ public StructuredQName getFunctionQName() {
+ return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
+ }
+
+ @Override
+ public ExtensionFunctionCall makeCallExpression() {
+ return new ExtensionFunctionCall() {
+ @Override
+ public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
+ return doCall(context, arguments);
+ }
+ };
+ }
+
+}
\ No newline at end of file
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java
new file mode 100644
index 0000000000..f90e2a23e8
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java
@@ -0,0 +1,67 @@
+package eu.dnetlib.dhp.utils.saxon;
+
+import net.sf.saxon.expr.XPathContext;
+import net.sf.saxon.om.Item;
+import net.sf.saxon.om.Sequence;
+import net.sf.saxon.trans.XPathException;
+import net.sf.saxon.value.SequenceType;
+import net.sf.saxon.value.StringValue;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
+
+public class ExtractYear extends AbstractExtensionFunction {
+
+ private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
+
+ @Override
+ public String getName() {
+ return "extractYear";
+ }
+
+ @Override
+ public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
+ if (arguments == null || arguments.length == 0) {
+ return new StringValue("");
+ }
+ final Item item = arguments[0].head();
+ if (item == null) {
+ return new StringValue("");
+ }
+ return new StringValue(_year(item.getStringValue()));
+ }
+
+ @Override
+ public int getMinimumNumberOfArguments() {
+ return 0;
+ }
+
+ @Override
+ public int getMaximumNumberOfArguments() {
+ return 1;
+ }
+
+ @Override
+ public SequenceType[] getArgumentTypes() {
+ return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
+ }
+
+ @Override
+ public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
+ return SequenceType.SINGLE_STRING;
+ }
+
+ private String _year(String s) {
+ Calendar c = new GregorianCalendar();
+ for (String format : dateFormats) {
+ try {
+ c.setTime(new SimpleDateFormat(format).parse(s));
+ String year = String.valueOf(c.get(Calendar.YEAR));
+ return year;
+ } catch (ParseException e) {}
+ }
+ return "";
+ }
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java
new file mode 100644
index 0000000000..634e08788b
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java
@@ -0,0 +1,66 @@
+package eu.dnetlib.dhp.utils.saxon;
+
+import net.sf.saxon.expr.XPathContext;
+import net.sf.saxon.om.Sequence;
+import net.sf.saxon.trans.XPathException;
+import net.sf.saxon.value.SequenceType;
+import net.sf.saxon.value.StringValue;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+public class NormalizeDate extends AbstractExtensionFunction {
+
+ private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
+
+ private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
+
+ @Override
+ public String getName() {
+ return "normalizeDate";
+ }
+
+ @Override
+ public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
+ if (arguments == null || arguments.length == 0) {
+ return new StringValue("");
+ }
+ String s = arguments[0].head().getStringValue();
+ return new StringValue(_year(s));
+ }
+
+ @Override
+ public int getMinimumNumberOfArguments() {
+ return 0;
+ }
+
+ @Override
+ public int getMaximumNumberOfArguments() {
+ return 1;
+ }
+
+ @Override
+ public SequenceType[] getArgumentTypes() {
+ return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
+ }
+
+ @Override
+ public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
+ return SequenceType.SINGLE_STRING;
+ }
+
+ private String _year(String s) {
+ final String date = s != null ? s.trim() : "";
+
+ for (String format : normalizeDateFormats) {
+ try {
+ Date parse = new SimpleDateFormat(format).parse(date);
+ String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
+ return res;
+ } catch (ParseException e) {}
+ }
+ return "";
+ }
+
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java
new file mode 100644
index 0000000000..a221e37c67
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java
@@ -0,0 +1,60 @@
+package eu.dnetlib.dhp.utils.saxon;
+
+import net.sf.saxon.expr.XPathContext;
+import net.sf.saxon.om.Item;
+import net.sf.saxon.om.Sequence;
+import net.sf.saxon.trans.XPathException;
+import net.sf.saxon.value.SequenceType;
+import net.sf.saxon.value.StringValue;
+import org.apache.commons.lang3.StringUtils;
+
+public class PickFirst extends AbstractExtensionFunction {
+
+ @Override
+ public String getName() {
+ return "pickFirst";
+ }
+
+ @Override
+ public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
+ if (arguments == null || arguments.length == 0) {
+ return new StringValue("");
+ }
+
+ final String s1 = getValue(arguments[0]);
+ final String s2 = arguments.length > 1 ? getValue(arguments[1]) : "";
+
+ return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
+ }
+
+ private String getValue(final Sequence arg) throws XPathException {
+ if (arg != null) {
+ final Item item = arg.head();
+ if (item != null) {
+ return item.getStringValue();
+ }
+ }
+ return "";
+ }
+
+ @Override
+ public int getMinimumNumberOfArguments() {
+ return 0;
+ }
+
+ @Override
+ public int getMaximumNumberOfArguments() {
+ return 2;
+ }
+
+ @Override
+ public SequenceType[] getArgumentTypes() {
+ return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
+ }
+
+ @Override
+ public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
+ return SequenceType.SINGLE_STRING;
+ }
+
+}
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java
new file mode 100644
index 0000000000..611709ff0d
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java
@@ -0,0 +1,30 @@
+package eu.dnetlib.dhp.utils.saxon;
+
+import net.sf.saxon.Configuration;
+import net.sf.saxon.TransformerFactoryImpl;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.stream.StreamSource;
+import java.io.StringReader;
+
+public class SaxonTransformerFactory {
+
+ /**
+ * Creates the index record transformer from the given XSLT
+     * @param xslt the XSLT stylesheet used to build the transformer
+     * @return a Transformer with the dnet extension functions (extractYear, normalizeDate, pickFirst) registered
+     * @throws TransformerException when the stylesheet cannot be compiled
+ */
+ public static Transformer newInstance(final String xslt) throws TransformerException {
+
+ final TransformerFactoryImpl factory = new TransformerFactoryImpl();
+ final Configuration conf = factory.getConfiguration();
+ conf.registerExtensionFunction(new ExtractYear());
+ conf.registerExtensionFunction(new NormalizeDate());
+ conf.registerExtensionFunction(new PickFirst());
+
+ return factory.newTransformer(new StreamSource(new StringReader(xslt)));
+ }
+
+}
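For illustration, a sketch showing how the registered extension functions can be called from a stylesheet through the namespace declared in AbstractExtensionFunction; the XSLT and the input record below are made up:

import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;

import javax.xml.transform.Transformer;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.StringReader;
import java.io.StringWriter;

public class SaxonExtensionExample {

    public static void main(String[] args) throws Exception {
        // stylesheet invoking the custom extractYear() function registered by the factory
        final String xslt =
            "<xsl:stylesheet version=\"2.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\" " +
            "  xmlns:dnet=\"http://www.d-net.research-infrastructures.eu/saxon-extension\">" +
            "  <xsl:template match=\"/record\">" +
            "    <year><xsl:value-of select=\"dnet:extractYear(date)\"/></year>" +
            "  </xsl:template>" +
            "</xsl:stylesheet>";

        final Transformer transformer = SaxonTransformerFactory.newInstance(xslt);

        final StringWriter output = new StringWriter();
        transformer.transform(
                new StreamSource(new StringReader("<record><date>2019-03-01</date></record>")),
                new StreamResult(output));

        System.out.println(output); // prints something like <year>2019</year>
    }
}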
diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
index fdea3c2d41..f4598ebd49 100644
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java
@@ -1,18 +1,13 @@
package eu.dnetlib.dhp.application;
import org.apache.commons.io.IOUtils;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-import java.io.ByteArrayOutputStream;
-import java.util.Base64;
-import java.util.zip.GZIPOutputStream;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
public class ArgumentApplicationParserTest {
-
@Test
public void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));
diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
index 4515429eab..a2bac54baf 100644
--- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java
@@ -1,8 +1,8 @@
package eu.dnetlib.dhp.model.mdstore;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
-import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class MetadataRecordTest {
@@ -10,6 +10,6 @@ public class MetadataRecordTest {
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
- assertTrue(r.getDateOfCollection() >0);
+ assertTrue(r.getDateOfCollection() > 0);
}
}
\ No newline at end of file
diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java
index fbc9dc2514..73df63b321 100644
--- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java
@@ -1,12 +1,12 @@
package eu.dnetlib.message;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
-import static org.junit.Assert.*;
+import static org.junit.jupiter.api.Assertions.*;
public class MessageTest {
diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java
index db6f4429a2..eb9fb172d6 100644
--- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java
+++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java
@@ -1,7 +1,7 @@
package eu.dnetlib.scholexplorer.relation;
-import org.apache.commons.io.IOUtils;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
+
public class RelationMapperTest {
diff --git a/dhp-doc-resources/img/data_provision_workflow.png b/dhp-doc-resources/img/data_provision_workflow.png
new file mode 100644
index 0000000000..31979fbb49
Binary files /dev/null and b/dhp-doc-resources/img/data_provision_workflow.png differ
diff --git a/dhp-schemas/README.md b/dhp-schemas/README.md
index 473ad4cf19..7431cda426 100644
--- a/dhp-schemas/README.md
+++ b/dhp-schemas/README.md
@@ -1,3 +1,11 @@
Description of the project
--------------------------
-This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
+This project defines the **object schemas** of the OpenAIRE main entities and of the relationships among them.
+Namely it defines the model for
+
+- **research product (result)**, which is subclassed into publication, dataset, other research product, and software
+- **data source**, the object describing the data provider (institutional repository, aggregator, CRIS system)
+- **organization**, the research bodies managing a data source or participating in a research project
+- **project**, the research project
+
+The serialization of such objects (data store files) is used to pass data between workflow nodes in the processing pipeline.
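As an aside, a minimal sketch of the serialization mentioned in the README, using the Relation class introduced by this changeset and the jackson-databind dependency declared in the module POM (identifier values are made up):

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class SchemaSerializationExample {

    public static void main(String[] args) throws Exception {
        // one of the model objects defined by dhp-schemas
        final Relation rel = new Relation();
        rel.setSource("50|sourceId");   // made-up identifiers
        rel.setTarget("50|targetId");
        rel.setRelType("resultResult");
        rel.setSubRelType("dedup");
        rel.setRelClass("merges");

        // workflow nodes exchange the serialized form of such objects
        final String json = new ObjectMapper().writeValueAsString(rel);
        System.out.println(json);
    }
}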
diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml
index 8bc30a8b0e..a85c0dd230 100644
--- a/dhp-schemas/pom.xml
+++ b/dhp-schemas/pom.xml
@@ -5,7 +5,7 @@
eu.dnetlib.dhp
dhp
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
../
@@ -27,18 +27,15 @@
- eu.dnetlib.dhp
- dhp-common
- ${project.version}
+ com.fasterxml.jackson.core
+ jackson-databind
- com.fasterxml.jackson.core
- jackson-databind
- test
+ com.google.guava
+ guava
-
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java
new file mode 100644
index 0000000000..0f9aa3adbc
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java
@@ -0,0 +1,38 @@
+package eu.dnetlib.dhp.schema.action;
+
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
+import java.io.Serializable;
+
+@JsonDeserialize(using = AtomicActionDeserializer.class)
+public class AtomicAction<T extends Oaf> implements Serializable {
+
+ private Class<T> clazz;
+
+ private T payload;
+
+ public AtomicAction() {
+ }
+
+ public AtomicAction(Class<T> clazz, T payload) {
+ this.clazz = clazz;
+ this.payload = payload;
+ }
+
+ public Class<T> getClazz() {
+ return clazz;
+ }
+
+ public void setClazz(Class<T> clazz) {
+ this.clazz = clazz;
+ }
+
+ public T getPayload() {
+ return payload;
+ }
+
+ public void setPayload(T payload) {
+ this.payload = payload;
+ }
+}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java
new file mode 100644
index 0000000000..e6017288fa
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java
@@ -0,0 +1,29 @@
+package eu.dnetlib.dhp.schema.action;
+
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
+import java.io.IOException;
+
+public class AtomicActionDeserializer extends JsonDeserializer {
+
+ @Override
+ public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException {
+ JsonNode node = jp.getCodec().readTree(jp);
+ String classTag = node.get("clazz").asText();
+ JsonNode payload = node.get("payload");
+ ObjectMapper mapper = new ObjectMapper();
+
+ try {
+ final Class<?> clazz = Class.forName(classTag);
+ return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz));
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ }
+ }
+}
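For illustration, the JSON shape this deserializer consumes: a clazz field carrying the payload class name and a payload field with the serialized Oaf object (the payload values are made up):

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;

public class AtomicActionWireFormat {

    public static void main(String[] args) throws Exception {
        // the class tag drives Class.forName(...) in AtomicActionDeserializer
        final String json =
                "{\"clazz\":\"eu.dnetlib.dhp.schema.oaf.Relation\"," +
                " \"payload\":{\"source\":\"1\",\"target\":\"2\"," +
                "\"relType\":\"resultResult\",\"subRelType\":\"dedup\",\"relClass\":\"merges\"}}";

        final AtomicAction action = new ObjectMapper().readValue(json, AtomicAction.class);

        System.out.println(action.getClazz());                              // class eu.dnetlib.dhp.schema.oaf.Relation
        System.out.println(((Relation) action.getPayload()).getRelClass()); // merges
    }
}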
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java
new file mode 100644
index 0000000000..e81120e428
--- /dev/null
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.dhp.schema.oaf;
+
+public class Country extends Qualifier {
+
+ private DataInfo dataInfo;
+
+ public DataInfo getDataInfo() {
+ return dataInfo;
+ }
+
+ public void setDataInfo(DataInfo dataInfo) {
+ this.dataInfo = dataInfo;
+ }
+
+}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java
index f52a500fe6..032468de2f 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java
@@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable {
private List<Field<String>> odlanguages;
- private List< Field<String>> odcontenttypes;
+ private List<Field<String>> odcontenttypes;
- private List< Field<String>> accessinfopackage;
+ private List<Field<String>> accessinfopackage;
// re3data fields
private Field releasestartdate;
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java
index 43af60286d..1839fbd53e 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java
@@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
this.place = place;
}
-
+ @JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(point) &&
StringUtils.isBlank(box) &&
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
index 8f852af65d..f82296d8bf 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java
@@ -22,6 +22,14 @@ public class Instance implements Serializable {
private Field dateofacceptance;
+ // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
+ private Field processingchargeamount;
+
+ // currency - alphabetic code described in ISO 4217. Defined here to cope with possible wrongly typed results
+ private Field processingchargecurrency;
+
+ private Field refereed; //peer-review status
+
public Field getLicense() {
return license;
}
@@ -86,7 +94,29 @@ public class Instance implements Serializable {
this.dateofacceptance = dateofacceptance;
}
+ public Field getProcessingchargeamount() {
+ return processingchargeamount;
+ }
+ public void setProcessingchargeamount(Field processingchargeamount) {
+ this.processingchargeamount = processingchargeamount;
+ }
+
+ public Field getProcessingchargecurrency() {
+ return processingchargecurrency;
+ }
+
+ public void setProcessingchargecurrency(Field processingchargecurrency) {
+ this.processingchargecurrency = processingchargecurrency;
+ }
+
+ public Field getRefereed() {
+ return refereed;
+ }
+
+ public void setRefereed(Field refereed) {
+ this.refereed = refereed;
+ }
public String toComparableString(){
return String.format("%s::%s::%s::%s",
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java
index 59cefa40e9..5a841b96f5 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java
@@ -1,12 +1,10 @@
package eu.dnetlib.dhp.schema.oaf;
-import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
-@JsonIgnoreProperties({"blank"})
+
public class KeyValue implements Serializable {
private String key;
@@ -39,7 +37,6 @@ public class KeyValue implements Serializable {
this.dataInfo = dataInfo;
}
- @JsonIgnore
public String toComparableString() {
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
}
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java
index 7e4660f4b4..ae2bf1a602 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java
@@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
+import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
schemeid != null ? schemeid : "",
schemename != null ? schemename : "");
}
+
+ @JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(classid) &&
StringUtils.isBlank(classname) &&
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
index 03122983dc..6738b86938 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java
@@ -1,85 +1,104 @@
package eu.dnetlib.dhp.schema.oaf;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static com.google.common.base.Preconditions.checkArgument;
public class Relation extends Oaf {
- private String relType;
+ private String relType;
- private String subRelType;
+ private String subRelType;
- private String relClass;
+ private String relClass;
- private String source;
+ private String source;
- private String target;
+ private String target;
- private List collectedFrom;
+ private List collectedFrom = new ArrayList<>();
- public String getRelType() {
- return relType;
- }
+ public String getRelType() {
+ return relType;
+ }
- public void setRelType(String relType) {
- this.relType = relType;
- }
+ public void setRelType(final String relType) {
+ this.relType = relType;
+ }
- public String getSubRelType() {
- return subRelType;
- }
+ public String getSubRelType() {
+ return subRelType;
+ }
- public void setSubRelType(String subRelType) {
- this.subRelType = subRelType;
- }
+ public void setSubRelType(final String subRelType) {
+ this.subRelType = subRelType;
+ }
- public String getRelClass() {
- return relClass;
- }
+ public String getRelClass() {
+ return relClass;
+ }
- public void setRelClass(String relClass) {
- this.relClass = relClass;
- }
+ public void setRelClass(final String relClass) {
+ this.relClass = relClass;
+ }
- public String getSource() {
- return source;
- }
+ public String getSource() {
+ return source;
+ }
- public void setSource(String source) {
- this.source = source;
- }
+ public void setSource(final String source) {
+ this.source = source;
+ }
- public String getTarget() {
- return target;
- }
+ public String getTarget() {
+ return target;
+ }
- public void setTarget(String target) {
- this.target = target;
- }
+ public void setTarget(final String target) {
+ this.target = target;
+ }
- public List getCollectedFrom() {
- return collectedFrom;
- }
+ public List getCollectedFrom() {
+ return collectedFrom;
+ }
- public void setCollectedFrom(List collectedFrom) {
- this.collectedFrom = collectedFrom;
- }
+ public void setCollectedFrom(final List collectedFrom) {
+ this.collectedFrom = collectedFrom;
+ }
- public void mergeFrom(Relation other) {
- this.mergeOAFDataInfo(other);
- if (other.getCollectedFrom() == null || other.getCollectedFrom().size() == 0)
- return;
- if (collectedFrom == null && other.getCollectedFrom() != null) {
- collectedFrom = other.getCollectedFrom();
- return;
- }
- if (other.getCollectedFrom() != null) {
- collectedFrom.addAll(other.getCollectedFrom());
+ public void mergeFrom(final Relation r) {
+
+ checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal");
+ checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal");
+ checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal");
+ checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be equal");
+ checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal");
+
+ setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
+ .distinct() // relies on KeyValue.equals
+ .collect(Collectors.toList()));
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ Relation relation = (Relation) o;
+ return relType.equals(relation.relType) &&
+ subRelType.equals(relation.subRelType) &&
+ relClass.equals(relation.relClass) &&
+ source.equals(relation.source) &&
+ target.equals(relation.target) &&
+ Objects.equals(collectedFrom, relation.collectedFrom);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom);
+ }
- collectedFrom = new ArrayList<>(collectedFrom
- .stream()
- .collect(Collectors.toMap(KeyValue::toComparableString, x -> x, (x1, x2) -> x1))
- .values());
- }
- }
}
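A small sketch of the merge semantics introduced above: the relation coordinates must match, and collectedFrom entries are concatenated and de-duplicated (provenance key/value pairs are made up; KeyValue is assumed to expose the usual setKey/setValue accessors):

import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Relation;

import java.util.Arrays;

public class RelationMergeExample {

    private static Relation rel(String provenanceKey, String provenanceName) {
        final Relation r = new Relation();
        r.setSource("1");
        r.setTarget("2");
        r.setRelType("resultResult");
        r.setSubRelType("dedup");
        r.setRelClass("merges");

        final KeyValue kv = new KeyValue();
        kv.setKey(provenanceKey);   // assumed bean accessors
        kv.setValue(provenanceName);
        r.setCollectedFrom(Arrays.asList(kv));
        return r;
    }

    public static void main(String[] args) {
        final Relation r1 = rel("10|ds_1", "Datasource One");
        final Relation r2 = rel("10|ds_2", "Datasource Two");

        // same source/target/relType/subRelType/relClass: the merge is accepted
        r1.mergeFrom(r2);
        System.out.println(r1.getCollectedFrom().size()); // 2, distinct() relies on KeyValue.equals

        // mismatching coordinates make checkArgument throw IllegalArgumentException
        final Relation r3 = rel("10|ds_1", "Datasource One");
        r3.setTarget("3");
        // r1.mergeFrom(r3); // would fail: "target ids must be equal"
    }
}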
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
index eb5572ce13..b6c73e84a6 100644
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java
@@ -1,12 +1,10 @@
package eu.dnetlib.dhp.schema.oaf;
-import org.apache.commons.lang3.StringUtils;
-
import java.io.Serializable;
-import java.util.*;
-import java.util.stream.Collectors;
+import java.util.Comparator;
+import java.util.List;
-public abstract class Result extends OafEntity implements Serializable {
+public class Result extends OafEntity implements Serializable {
private List author;
@@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
// common fields
private Qualifier language;
- private List<Qualifier> country;
+ private List<Country> country;
private List subject;
@@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
private List<Field<String>> coverage;
- private Field refereed; //peer-review status
+ private Qualifier bestaccessright;
private List context;
- // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
- private Field processingchargeamount;
-
- // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
- private Field processingchargecurrency;
-
private List externalReference;
private List instance;
@@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
this.language = language;
}
- public List<Qualifier> getCountry() {
+ public List<Country> getCountry() {
return country;
}
- public void setCountry(List<Qualifier> country) {
+ public void setCountry(List<Country> country) {
this.country = country;
}
@@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
this.coverage = coverage;
}
- public Field getRefereed() {
- return refereed;
+ public Qualifier getBestaccessright() {
+ return bestaccessright;
}
- public void setRefereed(Field refereed) {
- this.refereed = refereed;
+ public void setBestaccessright(Qualifier bestaccessright) {
+ this.bestaccessright = bestaccessright;
}
public List getContext() {
@@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
this.instance = instance;
}
- public Field getProcessingchargeamount() {
- return processingchargeamount;
- }
-
- public Result setProcessingchargeamount(Field processingchargeamount) {
- this.processingchargeamount = processingchargeamount;
- return this;
- }
-
- public Field getProcessingchargecurrency() {
- return processingchargecurrency;
- }
-
- public Result setProcessingchargecurrency(Field processingchargecurrency) {
- this.processingchargecurrency = processingchargecurrency;
- return this;
- }
-
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
@@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
coverage = mergeLists(coverage, r.getCoverage());
- if (r.getRefereed() != null && compareTrust(this, r) < 0)
- refereed = r.getRefereed();
-
context = mergeLists(context, r.getContext());
- if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
- processingchargeamount = r.getProcessingchargeamount();
-
- if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
- processingchargecurrency = r.getProcessingchargecurrency();
-
externalReference = mergeLists(externalReference, r.getExternalReference());
-
}
@@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
return a.size() > b.size() ? a : b;
}
-
}
diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java
new file mode 100644
index 0000000000..d216c05d5f
--- /dev/null
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.dhp.schema.action;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import org.apache.commons.lang3.StringUtils;
+
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * @author claudio.atzori
+ */
+public class AtomicActionTest {
+
+ @Test
+ public void serializationTest() throws IOException {
+
+ Relation rel = new Relation();
+ rel.setSource("1");
+ rel.setTarget("2");
+ rel.setRelType("resultResult");
+ rel.setSubRelType("dedup");
+ rel.setRelClass("merges");
+
+ AtomicAction aa1 = new AtomicAction(Relation.class, rel);
+
+ final ObjectMapper mapper = new ObjectMapper();
+ String json = mapper.writeValueAsString(aa1);
+
+ assertTrue(StringUtils.isNotBlank(json));
+
+ AtomicAction aa2 = mapper.readValue(json, AtomicAction.class);
+
+ assertEquals(aa1.getClazz(), aa2.getClazz());
+ assertEquals(aa1.getPayload(), aa2.getPayload());
+
+ }
+
+}
diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
index e487ddcbaa..ac4bd5d27d 100644
--- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java
@@ -1,11 +1,9 @@
package eu.dnetlib.dhp.schema.oaf;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
+import static org.junit.jupiter.api.Assertions.*;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
-import java.io.IOException;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -13,7 +11,7 @@ public class MergeTest {
OafEntity oaf;
- @Before
+ @BeforeEach
public void setUp() {
oaf = new Publication();
}
@@ -44,8 +42,8 @@ public class MergeTest {
a.mergeFrom(b);
- Assert.assertNotNull(a.getCollectedfrom());
- Assert.assertEquals(3, a.getCollectedfrom().size());
+ assertNotNull(a.getCollectedfrom());
+ assertEquals(3, a.getCollectedfrom().size());
}
@@ -60,8 +58,8 @@ public class MergeTest {
a.mergeFrom(b);
- Assert.assertNotNull(a.getSubject());
- Assert.assertEquals(3, a.getSubject().size());
+ assertNotNull(a.getSubject());
+ assertEquals(3, a.getSubject().size());
}
diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java
index 54f5f5f06f..6a88151c95 100644
--- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java
+++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java
@@ -6,7 +6,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;
diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml
index c6bb99fc3e..95e9578478 100644
--- a/dhp-workflows/dhp-aggregation/pom.xml
+++ b/dhp-workflows/dhp-aggregation/pom.xml
@@ -4,7 +4,7 @@
eu.dnetlib.dhp
dhp-workflows
- <version>1.0.5-SNAPSHOT</version>
+ <version>1.1.6-SNAPSHOT</version>
dhp-aggregation
@@ -24,6 +24,61 @@
eu.dnetlib.dhp
dhp-common
${project.version}
+
+
+ com.sun.xml.bind
+ jaxb-core
+
+
+
+
+
+ eu.dnetlib.dhp
+ dhp-schemas
+ ${project.version}
+
+
+
+ eu.dnetlib
+ dnet-actionmanager-common
+
+
+ eu.dnetlib
+ dnet-openaireplus-mapping-utils
+
+
+ saxonica
+ saxon
+
+
+ saxonica
+ saxon-dom
+
+
+ jgrapht
+ jgrapht
+
+
+ net.sf.ehcache
+ ehcache
+
+
+ org.springframework
+ spring-test
+
+
+ org.apache.*
+ *
+
+
+ apache
+ *
+
+
+
+
+ eu.dnetlib
+ dnet-openaire-data-protos
@@ -44,13 +99,22 @@
jaxen
jaxen
+
+
+ org.mongodb
+ mongo-java-driver
+
- org.mockito
- mockito-core
- 2.25.0
- test
+ org.apache.hadoop
+ hadoop-distcp
+
+
+
+ org.postgresql
+ postgresql
+ 42.2.10
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java
new file mode 100644
index 0000000000..9d0e82aca9
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java
@@ -0,0 +1,49 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+
+import java.util.Comparator;
+
+public class LicenseComparator implements Comparator<Qualifier> {
+
+ @Override
+ public int compare(Qualifier left, Qualifier right) {
+
+ if (left == null && right == null) return 0;
+ if (left == null) return 1;
+ if (right == null) return -1;
+
+ String lClass = left.getClassid();
+ String rClass = right.getClassid();
+
+ if (lClass.equals(rClass)) return 0;
+
+ if (lClass.equals("OPEN SOURCE")) return -1;
+ if (rClass.equals("OPEN SOURCE")) return 1;
+
+ if (lClass.equals("OPEN")) return -1;
+ if (rClass.equals("OPEN")) return 1;
+
+ if (lClass.equals("6MONTHS")) return -1;
+ if (rClass.equals("6MONTHS")) return 1;
+
+ if (lClass.equals("12MONTHS")) return -1;
+ if (rClass.equals("12MONTHS")) return 1;
+
+ if (lClass.equals("EMBARGO")) return -1;
+ if (rClass.equals("EMBARGO")) return 1;
+
+ if (lClass.equals("RESTRICTED")) return -1;
+ if (rClass.equals("RESTRICTED")) return 1;
+
+ if (lClass.equals("CLOSED")) return -1;
+ if (rClass.equals("CLOSED")) return 1;
+
+ if (lClass.equals("UNKNOWN")) return -1;
+ if (rClass.equals("UNKNOWN")) return 1;
+
+ // Else (but unlikely), lexicographical ordering will do.
+ return lClass.compareTo(rClass);
+ }
+
+}
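A usage sketch for the comparator, ordering access-right qualifiers from most to least open; the newBuilder()/setClassid(...) calls assume the standard protobuf-generated API of FieldTypeProtos.Qualifier:

import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.dhp.migration.actions.LicenseComparator;

import java.util.ArrayList;
import java.util.List;

public class LicenseComparatorExample {

    private static Qualifier accessRight(String classid) {
        // assumed protobuf builder API for the generated Qualifier message
        return Qualifier.newBuilder().setClassid(classid).build();
    }

    public static void main(String[] args) {
        final List<Qualifier> rights = new ArrayList<>();
        rights.add(accessRight("CLOSED"));
        rights.add(accessRight("OPEN"));
        rights.add(accessRight("EMBARGO"));

        // most open access right first
        rights.sort(new LicenseComparator());

        rights.forEach(q -> System.out.println(q.getClassid())); // OPEN, EMBARGO, CLOSED
    }
}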
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java
new file mode 100644
index 0000000000..487fac359c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java
@@ -0,0 +1,170 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.tools.DistCp;
+import org.apache.hadoop.tools.DistCpOptions;
+import org.apache.hadoop.util.ToolRunner;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.*;
+import java.util.stream.Collectors;
+
+public class MigrateActionSet {
+
+ private static final Log log = LogFactory.getLog(MigrateActionSet.class);
+
+ private static final String SEPARATOR = "/";
+ private static final String TARGET_PATHS = "target_paths";
+ private static final String RAWSET_PREFIX = "rawset_";
+
+ private static Boolean DEFAULT_TRANSFORM_ONLY = false;
+
+ public static void main(String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
+ "/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
+ parser.parseArgument(args);
+
+ new MigrateActionSet().run(parser);
+ }
+
+ private void run(ArgumentApplicationParser parser) throws Exception {
+
+ final String isLookupUrl = parser.get("isLookupUrl");
+ final String sourceNN = parser.get("sourceNameNode");
+ final String targetNN = parser.get("targetNameNode");
+ final String workDir = parser.get("workingDirectory");
+ final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
+
+ final String distcp_memory_mb = parser.get("distcp_memory_mb");
+ final String distcp_task_timeout = parser.get("distcp_task_timeout");
+
+ final String transform_only_s = parser.get("transform_only");
+
+ log.info("transform only param: " + transform_only_s);
+
+ final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
+
+ log.info("transform only: " + transformOnly);
+
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+
+ Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
+ FileSystem targetFS = FileSystem.get(conf);
+
+ Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
+ sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
+ FileSystem sourceFS = FileSystem.get(sourceConf);
+
+ Properties props = new Properties();
+
+ List<Path> targetPaths = new ArrayList<>();
+
+ final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
+ log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))));
+ for(Path source : sourcePaths) {
+
+ if (!sourceFS.exists(source)) {
+ log.warn(String.format("skipping unexisting path: %s", source));
+ } else {
+
+ LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
+
+ final String rawSet = pathQ.pollLast();
+ log.info(String.format("got RAWSET: %s", rawSet));
+
+ if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
+
+ final String actionSetDirectory = pathQ.pollLast();
+
+ final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
+
+ log.info(String.format("using TARGET PATH: %s", targetPath));
+
+ if (!transformOnly) {
+ if (targetFS.exists(targetPath)) {
+ targetFS.delete(targetPath, true);
+ }
+ runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
+ }
+
+ targetPaths.add(targetPath);
+ }
+ }
+ }
+
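+ // Expose the comma-separated list of target paths to the next Oozie action via the action output properties file.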
+ props.setProperty(TARGET_PATHS, targetPaths
+ .stream()
+ .map(p -> p.toString())
+ .collect(Collectors.joining(",")));
+ File file = new File(System.getProperty("oozie.action.output.properties"));
+
+ try(OutputStream os = new FileOutputStream(file)) {
+ props.store(os, "");
+ }
+ System.out.println(file.getAbsolutePath());
+ }
+
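+ // Runs a DistCp job copying 'source' to 'targetPath', preserving block size, replication and checksum type.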
+ private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
+
+ final DistCpOptions op = new DistCpOptions(source, targetPath);
+ op.setMaxMaps(distcp_num_maps);
+ op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
+ op.preserve(DistCpOptions.FileAttribute.REPLICATION);
+ op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
+
+ int res = ToolRunner.run(new DistCp(conf, op), new String[]{
+ "-Dmapred.task.timeout=" + distcp_task_timeout,
+ "-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
+ "-pb",
+ "-m " + distcp_num_maps,
+ source.toString(),
+ targetPath.toString()});
+
+ if (res != 0) {
+ throw new RuntimeException(String.format("distcp exited with code %s", res));
+ }
+ }
+
+ private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
+ final Configuration conf = new Configuration();
+ conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
+ conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
+ conf.set("dfs.http.client.retry.policy.enabled", "true");
+ conf.set("mapred.task.timeout", distcp_task_timeout);
+ conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
+ conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
+ return conf;
+ }
+
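+ // Resolves the HDFS paths of the latest rawset of each action set by querying the ActionManager profiles in the IS.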
+ private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
+ String XQUERY = "distinct-values(\n" +
+ "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
+ "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
+ "let $setDir := $x//SET/@directory/string()\n" +
+ "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
+ "return concat($basePath, '/', $setDir, '/', $rawSet))";
+
+ log.info(String.format("running xquery:\n%s", XQUERY));
+ return isLookUp.quickSearchProfile(XQUERY)
+ .stream()
+ .map(p -> sourceNN + p)
+ .map(Path::new)
+ .collect(Collectors.toList());
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java
new file mode 100644
index 0000000000..a7e70ee813
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java
@@ -0,0 +1,580 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import com.google.common.collect.Lists;
+import com.googlecode.protobuf.format.JsonFormat;
+import eu.dnetlib.data.proto.*;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+public class ProtoConverter implements Serializable {
+
+ public static final String UNKNOWN = "UNKNOWN";
+ public static final String NOT_AVAILABLE = "not available";
+ public static final String DNET_ACCESS_MODES = "dnet:access_modes";
+
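+ // Converts a protobuf Oaf payload (entity or relation) into the corresponding dhp-schema model object.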
+ public static Oaf convert(OafProtos.Oaf oaf) {
+ try {
+ switch (oaf.getKind()) {
+ case entity:
+ return convertEntity(oaf);
+ case relation:
+ return convertRelation(oaf);
+ default:
+ throw new IllegalArgumentException("invalid kind " + oaf.getKind());
+ }
+ } catch (Throwable e) {
+ throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
+ }
+ }
+
+ private static Relation convertRelation(OafProtos.Oaf oaf) {
+ final OafProtos.OafRel r = oaf.getRel();
+ final Relation rel = new Relation();
+ rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
+ rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
+ rel.setSource(r.getSource());
+ rel.setTarget(r.getTarget());
+ rel.setRelType(r.getRelType().toString());
+ rel.setSubRelType(r.getSubRelType().toString());
+ rel.setRelClass(r.getRelClass());
+ rel.setCollectedFrom(r.getCollectedfromCount() > 0 ?
+ r.getCollectedfromList().stream()
+ .map(kv -> mapKV(kv))
+ .collect(Collectors.toList()) : null);
+ return rel;
+ }
+
+ private static OafEntity convertEntity(OafProtos.Oaf oaf) {
+
+ switch (oaf.getEntity().getType()) {
+ case result:
+ final Result r = convertResult(oaf);
+ r.setInstance(convertInstances(oaf));
+ return r;
+ case project:
+ return convertProject(oaf);
+ case datasource:
+ return convertDataSource(oaf);
+ case organization:
+ return convertOrganization(oaf);
+ default:
+ throw new RuntimeException("received unknown type");
+ }
+ }
+
+ private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
+
+ final ResultProtos.Result r = oaf.getEntity().getResult();
+ if (r.getInstanceCount() > 0) {
+ return r.getInstanceList()
+ .stream()
+ .map(i -> convertInstance(i))
+ .collect(Collectors.toList());
+ }
+ return Lists.newArrayList();
+ }
+
+ private static Instance convertInstance(ResultProtos.Result.Instance ri) {
+ final Instance i = new Instance();
+ i.setAccessright(mapQualifier(ri.getAccessright()));
+ i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
+ i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
+ i.setDistributionlocation(ri.getDistributionlocation());
+ i.setHostedby(mapKV(ri.getHostedby()));
+ i.setInstancetype(mapQualifier(ri.getInstancetype()));
+ i.setLicense(mapStringField(ri.getLicense()));
+ i.setUrl(ri.getUrlList());
+ i.setRefereed(mapStringField(ri.getRefereed()));
+ i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
+ i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
+ return i;
+ }
+
+ private static Organization convertOrganization(OafProtos.Oaf oaf) {
+ final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
+ final Organization org = setOaf(new Organization(), oaf);
+ setEntity(org, oaf);
+ org.setLegalshortname(mapStringField(m.getLegalshortname()));
+ org.setLegalname(mapStringField(m.getLegalname()));
+ org.setAlternativeNames(m.getAlternativeNamesList().
+ stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
+ org.setLogourl(mapStringField(m.getLogourl()));
+ org.setEclegalbody(mapStringField(m.getEclegalbody()));
+ org.setEclegalperson(mapStringField(m.getEclegalperson()));
+ org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
+ org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
+ org.setEchighereducation(mapStringField(m.getEchighereducation()));
+ org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
+ org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
+ org.setEcenterprise(mapStringField(m.getEcenterprise()));
+ org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
+ org.setEcnutscode(mapStringField(m.getEcnutscode()));
+ org.setCountry(mapQualifier(m.getCountry()));
+
+ return org;
+ }
+
+ private static Datasource convertDataSource(OafProtos.Oaf oaf) {
+ final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
+ final Datasource datasource = setOaf(new Datasource(), oaf);
+ setEntity(datasource, oaf);
+ datasource.setAccessinfopackage(m.getAccessinfopackageList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ datasource.setCertificates(mapStringField(m.getCertificates()));
+ datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
+ datasource.setContactemail(mapStringField(m.getContactemail()));
+ datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
+ datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
+ datasource.setDataprovider(mapBoolField(m.getDataprovider()));
+ datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
+ datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
+ datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
+ datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
+ datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
+ datasource.setDescription(mapStringField(m.getDescription()));
+ datasource.setEnglishname(mapStringField(m.getEnglishname()));
+ datasource.setLatitude(mapStringField(m.getLatitude()));
+ datasource.setLongitude(mapStringField(m.getLongitude()));
+ datasource.setLogourl(mapStringField(m.getLogourl()));
+ datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
+ datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
+ datasource.setOdcontenttypes(m.getOdcontenttypesList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ datasource.setOdlanguages(m.getOdlanguagesList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
+ datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
+ datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
+ datasource.setOfficialname(mapStringField(m.getOfficialname()));
+ datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
+ datasource.setPidsystems(mapStringField(m.getPidsystems()));
+ datasource.setPolicies(m.getPoliciesList()
+ .stream()
+ .map(ProtoConverter::mapKV)
+ .collect(Collectors.toList()));
+ datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
+ datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
+ datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
+ datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
+ datasource.setSubjects(m.getSubjectsList()
+ .stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ datasource.setVersioning(mapBoolField(m.getVersioning()));
+ datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
+ datasource.setJournal(mapJournal(m.getJournal()));
+
+
+ return datasource;
+ }
+
+ private static Project convertProject(OafProtos.Oaf oaf) {
+ final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
+ final Project project = setOaf(new Project(), oaf);
+ setEntity(project, oaf);
+ project.setAcronym(mapStringField(m.getAcronym()));
+ project.setCallidentifier(mapStringField(m.getCallidentifier()));
+ project.setCode(mapStringField(m.getCode()));
+ project.setContactemail(mapStringField(m.getContactemail()));
+ project.setContactfax(mapStringField(m.getContactfax()));
+ project.setContactfullname(mapStringField(m.getContactfullname()));
+ project.setContactphone(mapStringField(m.getContactphone()));
+ project.setContracttype(mapQualifier(m.getContracttype()));
+ project.setCurrency(mapStringField(m.getCurrency()));
+ project.setDuration(mapStringField(m.getDuration()));
+ project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
+ project.setEcsc39(mapStringField(m.getEcsc39()));
+ project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
+ project.setStartdate(mapStringField(m.getStartdate()));
+ project.setEnddate(mapStringField(m.getEnddate()));
+ project.setFundedamount(m.getFundedamount());
+ project.setTotalcost(m.getTotalcost());
+ project.setKeywords(mapStringField(m.getKeywords()));
+ project.setSubjects(m.getSubjectsList().stream()
+ .map(sp -> mapStructuredProperty(sp))
+ .collect(Collectors.toList()));
+ project.setTitle(mapStringField(m.getTitle()));
+ project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
+ project.setFundingtree(m.getFundingtreeList().stream()
+ .map(f -> mapStringField(f))
+ .collect(Collectors.toList()));
+ project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
+ project.setSummary(mapStringField(m.getSummary()));
+ project.setOptional1(mapStringField(m.getOptional1()));
+ project.setOptional2(mapStringField(m.getOptional2()));
+ return project;
+ }
+
+ private static Result convertResult(OafProtos.Oaf oaf) {
+ switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
+ case "dataset":
+ return createDataset(oaf);
+ case "publication":
+ return createPublication(oaf);
+ case "software":
+ return createSoftware(oaf);
+ case "other":
+ return createORP(oaf);
+ default:
+ Result result = setOaf(new Result(), oaf);
+ setEntity(result, oaf);
+ return setResult(result, oaf);
+ }
+ }
+
+ private static Software createSoftware(OafProtos.Oaf oaf) {
+ ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
+ Software software = setOaf(new Software(), oaf);
+ setEntity(software, oaf);
+ setResult(software, oaf);
+
+ software.setDocumentationUrl(m.getDocumentationUrlList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ software.setLicense(m.getLicenseList()
+ .stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
+ software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
+ return software;
+ }
+
+ private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
+ ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
+ OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
+ setEntity(otherResearchProducts, oaf);
+ setResult(otherResearchProducts, oaf);
+ otherResearchProducts.setContactperson(m.getContactpersonList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ otherResearchProducts.setContactgroup(m.getContactgroupList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ otherResearchProducts.setTool(m.getToolList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+
+ return otherResearchProducts;
+ }
+
+ private static Publication createPublication(OafProtos.Oaf oaf) {
+
+ ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
+ Publication publication = setOaf(new Publication(), oaf);
+ setEntity(publication, oaf);
+ setResult(publication, oaf);
+ publication.setJournal(mapJournal(m.getJournal()));
+ return publication;
+ }
+
+ private static Dataset createDataset(OafProtos.Oaf oaf) {
+
+ ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
+ Dataset dataset = setOaf(new Dataset(), oaf);
+ setEntity(dataset, oaf);
+ setResult(dataset, oaf);
+ dataset.setStoragedate(mapStringField(m.getStoragedate()));
+ dataset.setDevice(mapStringField(m.getDevice()));
+ dataset.setSize(mapStringField(m.getSize()));
+ dataset.setVersion(mapStringField(m.getVersion()));
+ dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
+ dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
+ dataset.setGeolocation(m.getGeolocationList()
+ .stream()
+ .map(ProtoConverter::mapGeolocation)
+ .collect(Collectors.toList()));
+ return dataset;
+
+ }
+
+ public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
+ oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
+ oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
+ return oaf;
+ }
+
+ public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
+ //setting Entity fields
+ final OafProtos.OafEntity e = oaf.getEntity();
+ entity.setId(e.getId());
+ entity.setOriginalId(e.getOriginalIdList());
+ entity.setCollectedfrom(e.getCollectedfromList()
+ .stream()
+ .map(ProtoConverter::mapKV)
+ .collect(Collectors.toList()));
+ entity.setPid(e.getPidList().stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ entity.setDateofcollection(e.getDateofcollection());
+ entity.setDateoftransformation(e.getDateoftransformation());
+ entity.setExtraInfo(e.getExtraInfoList()
+ .stream()
+ .map(ProtoConverter::mapExtraInfo)
+ .collect(Collectors.toList()));
+ return entity;
+ }
+
+ public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
+ //setting Entity fields
+ final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
+ entity.setAuthor(m.getAuthorList()
+ .stream()
+ .map(ProtoConverter::mapAuthor)
+ .collect(Collectors.toList()));
+ entity.setResulttype(mapQualifier(m.getResulttype()));
+ entity.setLanguage(mapQualifier(m.getLanguage()));
+ entity.setCountry(m.getCountryList()
+ .stream()
+ .map(ProtoConverter::mapQualifierAsCountry)
+ .collect(Collectors.toList()));
+ entity.setSubject(m.getSubjectList()
+ .stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ entity.setTitle(m.getTitleList()
+ .stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ entity.setRelevantdate(m.getRelevantdateList()
+ .stream()
+ .map(ProtoConverter::mapStructuredProperty)
+ .collect(Collectors.toList()));
+ entity.setDescription(m.getDescriptionList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
+ entity.setPublisher(mapStringField(m.getPublisher()));
+ entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
+ entity.setSource(m.getSourceList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setFulltext(m.getFulltextList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setFormat(m.getFormatList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setContributor(m.getContributorList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setResourcetype(mapQualifier(m.getResourcetype()));
+ entity.setCoverage(m.getCoverageList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ entity.setContext(m.getContextList()
+ .stream()
+ .map(ProtoConverter::mapContext)
+ .collect(Collectors.toList()));
+
+ entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
+
+ return entity;
+ }
+
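+ // Picks the most open access right among the instances (ordering defined by LicenseComparator) and fills
+ // blank classid/classname/scheme fields with the default UNKNOWN / "not available" / dnet:access_modes values.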
+ private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
+ if (instanceList != null) {
+ final Optional<FieldTypeProtos.Qualifier> min = instanceList.stream()
+ .map(i -> i.getAccessright()).min(new LicenseComparator());
+
+ final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
+
+ if (StringUtils.isBlank(rights.getClassid())) {
+ rights.setClassid(UNKNOWN);
+ }
+ if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
+ rights.setClassname(NOT_AVAILABLE);
+ }
+ if (StringUtils.isBlank(rights.getSchemeid())) {
+ rights.setSchemeid(DNET_ACCESS_MODES);
+ }
+ if (StringUtils.isBlank(rights.getSchemename())) {
+ rights.setSchemename(DNET_ACCESS_MODES);
+ }
+
+ return rights;
+ }
+ return null;
+ }
+
+ private static Context mapContext(ResultProtos.Result.Context context) {
+
+ final Context entity = new Context();
+ entity.setId(context.getId());
+ entity.setDataInfo(context.getDataInfoList()
+ .stream()
+ .map(ProtoConverter::mapDataInfo)
+ .collect(Collectors.toList()));
+ return entity;
+ }
+
+
+ public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
+ final KeyValue keyValue = new KeyValue();
+ keyValue.setKey(kv.getKey());
+ keyValue.setValue(kv.getValue());
+ keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
+ return keyValue;
+ }
+
+ public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
+ final DataInfo dataInfo = new DataInfo();
+ dataInfo.setDeletedbyinference(d.getDeletedbyinference());
+ dataInfo.setInferenceprovenance(d.getInferenceprovenance());
+ dataInfo.setInferred(d.getInferred());
+ dataInfo.setInvisible(d.getInvisible());
+ dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
+ dataInfo.setTrust(d.getTrust());
+ return dataInfo;
+ }
+
+ public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
+ final Qualifier qualifier = new Qualifier();
+ qualifier.setClassid(q.getClassid());
+ qualifier.setClassname(q.getClassname());
+ qualifier.setSchemeid(q.getSchemeid());
+ qualifier.setSchemename(q.getSchemename());
+ return qualifier;
+ }
+
+ public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
+ final Country c = new Country();
+ c.setClassid(q.getClassid());
+ c.setClassname(q.getClassname());
+ c.setSchemeid(q.getSchemeid());
+ c.setSchemename(q.getSchemename());
+ c.setDataInfo(mapDataInfo(q.getDataInfo()));
+ return c;
+ }
+
+ public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
+ final StructuredProperty structuredProperty = new StructuredProperty();
+ structuredProperty.setValue(sp.getValue());
+ structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
+ structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
+ return structuredProperty;
+ }
+
+ public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
+ final ExtraInfo entity = new ExtraInfo();
+ entity.setName(extraInfo.getName());
+ entity.setTypology(extraInfo.getTypology());
+ entity.setProvenance(extraInfo.getProvenance());
+ entity.setTrust(extraInfo.getTrust());
+ entity.setValue(extraInfo.getValue());
+ return entity;
+ }
+
+ public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
+ final OAIProvenance entity = new OAIProvenance();
+ entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
+ return entity;
+ }
+
+ public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
+ final OriginDescription originDescriptionResult = new OriginDescription();
+ originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
+ originDescriptionResult.setAltered(originDescription.getAltered());
+ originDescriptionResult.setBaseURL(originDescription.getBaseURL());
+ originDescriptionResult.setIdentifier(originDescription.getIdentifier());
+ originDescriptionResult.setDatestamp(originDescription.getDatestamp());
+ originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
+ return originDescriptionResult;
+ }
+
+ public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
+ final Field<String> stringField = new Field<>();
+ stringField.setValue(s.getValue());
+ stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
+ return stringField;
+ }
+
+ public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
+ final Field<Boolean> booleanField = new Field<>();
+ booleanField.setValue(b.getValue());
+ booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
+ return booleanField;
+ }
+
+ public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
+ final Field<Integer> entity = new Field<>();
+ entity.setValue(b.getValue());
+ entity.setDataInfo(mapDataInfo(b.getDataInfo()));
+ return entity;
+ }
+
+ public static Journal mapJournal(FieldTypeProtos.Journal j) {
+ final Journal journal = new Journal();
+ journal.setConferencedate(j.getConferencedate());
+ journal.setConferenceplace(j.getConferenceplace());
+ journal.setEdition(j.getEdition());
+ journal.setEp(j.getEp());
+ journal.setIss(j.getIss());
+ journal.setIssnLinking(j.getIssnLinking());
+ journal.setIssnOnline(j.getIssnOnline());
+ journal.setIssnPrinted(j.getIssnPrinted());
+ journal.setName(j.getName());
+ journal.setSp(j.getSp());
+ journal.setVol(j.getVol());
+ journal.setDataInfo(mapDataInfo(j.getDataInfo()));
+ return journal;
+ }
+
+ public static Author mapAuthor(FieldTypeProtos.Author author) {
+ final Author entity = new Author();
+ entity.setFullname(author.getFullname());
+ entity.setName(author.getName());
+ entity.setSurname(author.getSurname());
+ entity.setRank(author.getRank());
+ entity.setPid(author.getPidList()
+ .stream()
+ .map(kv -> {
+ final StructuredProperty sp = new StructuredProperty();
+ sp.setValue(kv.getValue());
+ final Qualifier q = new Qualifier();
+ q.setClassid(kv.getKey());
+ q.setClassname(kv.getKey());
+ sp.setQualifier(q);
+ return sp;
+ })
+ .collect(Collectors.toList()));
+ entity.setAffiliation(author.getAffiliationList()
+ .stream()
+ .map(ProtoConverter::mapStringField)
+ .collect(Collectors.toList()));
+ return entity;
+
+ }
+
+ public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
+ final GeoLocation entity = new GeoLocation();
+ entity.setPoint(geoLocation.getPoint());
+ entity.setBox(geoLocation.getBox());
+ entity.setPlace(geoLocation.getPlace());
+ return entity;
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java
new file mode 100644
index 0000000000..19a0cb5c9d
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java
@@ -0,0 +1,194 @@
+package eu.dnetlib.dhp.migration.actions;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.protobuf.InvalidProtocolBufferException;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.data.proto.OafProtos;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.LinkedList;
+
+public class TransformActions implements Serializable {
+
+ private static final Log log = LogFactory.getLog(TransformActions.class);
+ private static final String SEPARATOR = "/";
+
+ public static void main(String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
+ "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
+ parser.parseArgument(args);
+
+ new TransformActions().run(parser);
+ }
+
+ private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
+
+ final String isLookupUrl = parser.get("isLookupUrl");
+ log.info("isLookupUrl: " + isLookupUrl);
+
+ final String inputPaths = parser.get("inputPaths");
+
+ if (StringUtils.isBlank(inputPaths)) {
+ throw new RuntimeException("empty inputPaths");
+ }
+ log.info("inputPaths: " + inputPaths);
+
+ final String targetBaseDir = getTargetBaseDir(isLookupUrl);
+
+ try(SparkSession spark = getSparkSession(parser)) {
+ final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+ final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
+
+ for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
+
+ LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
+
+ final String rawset = pathQ.pollLast();
+ final String actionSetDirectory = pathQ.pollLast();
+
+ final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
+
+ if (fs.exists(targetDirectory)) {
+ log.info(String.format("found target directory '%s", targetDirectory));
+ fs.delete(targetDirectory, true);
+ log.info(String.format("deleted target directory '%s", targetDirectory));
+ }
+
+ log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
+
+ sc.sequenceFile(sourcePath, Text.class, Text.class)
+ .mapToPair(a -> new Tuple2<>(a._1(), eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())))
+ .mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
+ .filter(t -> StringUtils.isNotBlank(t._2().toString()))
+ .saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+ }
+ }
+ }
+
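+ // Serialises the given action as JSON: actions carrying a protobuf payload are converted to the new model,
+ // while payload-less dedupSimilarity actions are rebuilt as Relations from their identifier; anything else
+ // yields an empty Text, filtered out by the caller.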
+ private Text transformAction(String atomicActionId, eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
+ final Text out = new Text();
+ final ObjectMapper mapper = new ObjectMapper();
+ if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
+ out.set(mapper.writeValueAsString(doTransform(aa)));
+ } else {
+ if (atomicActionId.contains("dedupSimilarity")) {
+ out.set(mapper.writeValueAsString(getRelationAtomicAction(atomicActionId)));
+ }
+ }
+
+ return out;
+ }
+
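+ // Rebuilds a dedup similarity Relation from an action id shaped like 'source@relType_subRelType_relClass@target'.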
+ private AtomicAction<Relation> getRelationAtomicAction(String atomicActionId) {
+ final String[] splitId = atomicActionId.split("@");
+
+ String source = splitId[0];
+ String target = splitId[2];
+
+ String[] relSemantic = splitId[1].split("_");
+
+ Relation rel = new Relation();
+ rel.setSource(source);
+ rel.setTarget(target);
+ rel.setRelType(relSemantic[0]);
+ rel.setSubRelType(relSemantic[1]);
+ rel.setRelClass(relSemantic[2]);
+
+ DataInfo d = new DataInfo();
+ d.setDeletedbyinference(false);
+ d.setInferenceprovenance("deduplication");
+ d.setInferred(true);
+ d.setInvisible(false);
+ Qualifier provenanceaction = new Qualifier();
+
+ provenanceaction.setClassid("deduplication");
+ provenanceaction.setClassname("deduplication");
+ provenanceaction.setSchemeid("dnet:provenanceActions");
+ provenanceaction.setSchemename("dnet:provenanceActions");
+
+ d.setProvenanceaction(provenanceaction);
+
+ rel.setDataInfo(d);
+
+ return new AtomicAction<>(Relation.class, rel);
+ }
+
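+ // Parses the protobuf payload and wraps the converted entity/relation in an AtomicAction of the matching type.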
+ private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException {
+ final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
+ final Oaf oaf = ProtoConverter.convert(proto_oaf);
+ switch (proto_oaf.getKind()) {
+ case entity:
+ switch (proto_oaf.getEntity().getType()) {
+ case datasource:
+ return new AtomicAction<>(Datasource.class, (Datasource) oaf);
+ case organization:
+ return new AtomicAction<>(Organization.class, (Organization) oaf);
+ case project:
+ return new AtomicAction<>(Project.class, (Project) oaf);
+ case result:
+ final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid();
+ switch (resulttypeid) {
+ case "publication":
+ return new AtomicAction<>(Publication.class, (Publication) oaf);
+ case "software":
+ return new AtomicAction<>(Software.class, (Software) oaf);
+ case "other":
+ return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
+ case "dataset":
+ return new AtomicAction<>(Dataset.class, (Dataset) oaf);
+ default:
+ // can be an update, where the resulttype is not specified
+ return new AtomicAction<>(Result.class, (Result) oaf);
+ }
+ default:
+ throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType());
+ }
+ case relation:
+ return new AtomicAction<>(Relation.class, (Relation) oaf);
+ default:
+ throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
+ }
+ }
+
+ private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
+ ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+ String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
+ return isLookUp.getResourceProfileByQuery(XQUERY);
+ }
+
+ private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
+ SparkConf conf = new SparkConf();
+
+ return SparkSession
+ .builder()
+ .appName(TransformActions.class.getSimpleName())
+ .master(parser.get("master"))
+ .config(conf)
+ .enableHiveSupport()
+ .getOrCreate();
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java
new file mode 100644
index 0000000000..7db2b17726
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java
@@ -0,0 +1,481 @@
+package eu.dnetlib.dhp.migration.step1;
+
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.sql.Array;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
+import eu.dnetlib.dhp.migration.utils.DbClient;
+import eu.dnetlib.dhp.schema.oaf.Context;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
+
+ private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
+ qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
+
+ private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
+
+ private final DbClient dbClient;
+
+ private final long lastUpdateTimestamp;
+
+ public static void main(final String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
+
+ parser.parseArgument(args);
+
+ final String dbUrl = parser.get("postgresUrl");
+ final String dbUser = parser.get("postgresUser");
+ final String dbPassword = parser.get("postgresPassword");
+
+ final String hdfsPath = parser.get("hdfsPath");
+
+ final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
+
+ try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) {
+ if (processClaims) {
+ log.info("Processing claims...");
+ smdbe.execute("queryClaims.sql", smdbe::processClaims);
+ } else {
+ log.info("Processing datasources...");
+ smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
+
+ log.info("Processing projects...");
+ smdbe.execute("queryProjects.sql", smdbe::processProject);
+
+ log.info("Processing orgs...");
+ smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
+
+ log.info("Processing relations ds <-> orgs ...");
+ smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
+
+ log.info("Processing projects <-> orgs ...");
+ smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
+ }
+ log.info("All done.");
+ }
+ }
+
+ protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST
+ super();
+ this.dbClient = null;
+ this.lastUpdateTimestamp = new Date().getTime();
+ }
+
+ public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
+ final String dbPassword) throws Exception {
+ super(hdfsPath);
+ this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
+ this.lastUpdateTimestamp = new Date().getTime();
+ }
+
+ public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) throws Exception {
+ final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
+
+ final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf));
+
+ dbClient.processResults(sql, consumer);
+ }
+
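+ // Maps one row of queryDatasources.sql to a Datasource entity; fields not returned by the query are left null or empty.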
+ public List<Oaf> processDatasource(final ResultSet rs) {
+
+ try {
+
+ final DataInfo info = prepareDataInfo(rs);
+
+ final Datasource ds = new Datasource();
+
+ ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
+ ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
+ ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
+ ds.setPid(new ArrayList<>());
+ ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
+ ds.setDateoftransformation(null); // Value not returned by the SQL query
+ ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
+ ds.setOaiprovenance(null); // Values not present in the DB
+ ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
+ ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
+ ds.setOfficialname(field(rs.getString("officialname"), info));
+ ds.setEnglishname(field(rs.getString("englishname"), info));
+ ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
+ ds.setLogourl(field(rs.getString("logourl"), info));
+ ds.setContactemail(field(rs.getString("contactemail"), info));
+ ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
+ ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
+ ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
+ ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
+ ds.setDescription(field(rs.getString("description"), info));
+ ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
+ ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
+ ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
+ ds.setOdpolicies(field(rs.getString("odpolicies"), info));
+ ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
+ ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
+ ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
+ ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
+ ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
+ ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
+ ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
+ ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
+ ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
+ ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
+ ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
+ ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
+ ds.setVersioning(field(rs.getBoolean("versioning"), info));
+ ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
+ ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
+ ds.setPidsystems(field(rs.getString("pidsystems"), info));
+ ds.setCertificates(field(rs.getString("certificates"), info));
+ ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
+ ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
+ ds.setDataInfo(info);
+ ds.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(ds);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public List<Oaf> processProject(final ResultSet rs) {
+ try {
+
+ final DataInfo info = prepareDataInfo(rs);
+
+ final Project p = new Project();
+
+ p.setId(createOpenaireId(40, rs.getString("projectid"), true));
+ p.setOriginalId(Arrays.asList(rs.getString("projectid")));
+ p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
+ p.setPid(new ArrayList<>());
+ p.setDateofcollection(asString(rs.getDate("dateofcollection")));
+ p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
+ p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
+ p.setOaiprovenance(null); // Values not present in the DB
+ p.setWebsiteurl(field(rs.getString("websiteurl"), info));
+ p.setCode(field(rs.getString("code"), info));
+ p.setAcronym(field(rs.getString("acronym"), info));
+ p.setTitle(field(rs.getString("title"), info));
+ p.setStartdate(field(asString(rs.getDate("startdate")), info));
+ p.setEnddate(field(asString(rs.getDate("enddate")), info));
+ p.setCallidentifier(field(rs.getString("callidentifier"), info));
+ p.setKeywords(field(rs.getString("keywords"), info));
+ p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
+ p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
+ p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
+ p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
+ p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
+ p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
+ p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
+ p.setOptional1(field(rs.getString("optional1"), info));
+ p.setOptional2(field(rs.getString("optional2"), info));
+ p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
+ p.setContactfullname(field(rs.getString("contactfullname"), info));
+ p.setContactfax(field(rs.getString("contactfax"), info));
+ p.setContactphone(field(rs.getString("contactphone"), info));
+ p.setContactemail(field(rs.getString("contactemail"), info));
+ p.setSummary(field(rs.getString("summary"), info));
+ p.setCurrency(field(rs.getString("currency"), info));
+ p.setTotalcost((float) rs.getDouble("totalcost"));
+ p.setFundedamount((float) rs.getDouble("fundedamount"));
+ p.setDataInfo(info);
+ p.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(p);
+
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public List<Oaf> processOrganization(final ResultSet rs) {
+
+ try {
+
+ final DataInfo info = prepareDataInfo(rs);
+
+ final Organization o = new Organization();
+
+ o.setId(createOpenaireId(20, rs.getString("organizationid"), true));
+ o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
+ o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
+ o.setPid(new ArrayList<>());
+ o.setDateofcollection(asString(rs.getDate("dateofcollection")));
+ o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
+ o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
+ o.setOaiprovenance(null); // Values not present in the DB
+ o.setLegalshortname(field(rs.getString("legalshortname"), info));
+ o.setLegalname(field(rs.getString("legalname"), info));
+ o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
+ o.setWebsiteurl(field(rs.getString("websiteurl"), info));
+ o.setLogourl(field(rs.getString("logourl"), info));
+ o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
+ o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
+ o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
+ o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
+ o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
+ o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
+ o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
+ o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
+ o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
+ o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
+ o.setCountry(prepareQualifierSplitting(rs.getString("country")));
+ o.setDataInfo(info);
+ o.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(o);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
+ try {
+ final DataInfo info = prepareDataInfo(rs);
+ final String orgId = createOpenaireId(20, rs.getString("organization"), true);
+ final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
+ final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
+
+ final Relation r1 = new Relation();
+ r1.setRelType("datasourceOrganization");
+ r1.setSubRelType("provision");
+ r1.setRelClass("isProvidedBy");
+ r1.setSource(dsId);
+ r1.setTarget(orgId);
+ r1.setCollectedFrom(collectedFrom);
+ r1.setDataInfo(info);
+ r1.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ final Relation r2 = new Relation();
+ r2.setRelType("datasourceOrganization");
+ r2.setSubRelType("provision");
+ r2.setRelClass("provides");
+ r2.setSource(orgId);
+ r2.setTarget(dsId);
+ r2.setCollectedFrom(collectedFrom);
+ r2.setDataInfo(info);
+ r2.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(r1, r2);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public List<Oaf> processProjectOrganization(final ResultSet rs) {
+ try {
+ final DataInfo info = prepareDataInfo(rs);
+ final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
+ final String projectId = createOpenaireId(40, rs.getString("project"), true);
+ final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
+
+ final Relation r1 = new Relation();
+ r1.setRelType("projectOrganization");
+ r1.setSubRelType("participation");
+ r1.setRelClass("isParticipant");
+ r1.setSource(projectId);
+ r1.setTarget(orgId);
+ r1.setCollectedFrom(collectedFrom);
+ r1.setDataInfo(info);
+ r1.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ final Relation r2 = new Relation();
+ r2.setRelType("projectOrganization");
+ r2.setSubRelType("participation");
+ r2.setRelClass("hasParticipant");
+ r2.setSource(orgId);
+ r2.setTarget(projectId);
+ r2.setCollectedFrom(collectedFrom);
+ r2.setDataInfo(info);
+ r2.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(r1, r2);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
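+ // Maps claim rows either to a Result carrying the claimed context (source_type = 'context')
+ // or to a pair of inverse Relations between the claimed entities.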
+ public List<Oaf> processClaims(final ResultSet rs) {
+
+ final DataInfo info =
+ dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
+
+ try {
+
+ if (rs.getString("source_type").equals("context")) {
+ final Result r;
+
+ if (rs.getString("target_type").equals("dataset")) {
+ r = new Dataset();
+ } else if (rs.getString("target_type").equals("software")) {
+ r = new Software();
+ } else if (rs.getString("target_type").equals("other")) {
+ r = new OtherResearchProduct();
+ } else {
+ r = new Publication();
+ }
+ r.setId(createOpenaireId(50, rs.getString("target_id"), false));
+ r.setLastupdatetimestamp(lastUpdateTimestamp);
+ r.setContext(prepareContext(rs.getString("source_id"), info));
+ r.setDataInfo(info);
+
+ return Arrays.asList(r);
+ } else {
+ final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
+ final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
+
+ final Relation r1 = new Relation();
+ final Relation r2 = new Relation();
+
+ if (rs.getString("source_type").equals("project")) {
+ r1.setRelType("resultProject");
+ r1.setSubRelType("outcome");
+ r1.setRelClass("produces");
+
+ r2.setRelType("resultProject");
+ r2.setSubRelType("outcome");
+ r2.setRelClass("isProducedBy");
+ } else {
+ r1.setRelType("resultResult");
+ r1.setSubRelType("relationship");
+ r1.setRelClass("isRelatedTo");
+
+ r2.setRelType("resultResult");
+ r2.setSubRelType("relationship");
+ r2.setRelClass("isRelatedTo");
+ }
+
+ r1.setSource(sourceId);
+ r1.setTarget(targetId);
+ r1.setDataInfo(info);
+ r1.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ r2.setSource(targetId);
+ r2.setTarget(sourceId);
+ r2.setDataInfo(info);
+ r2.setLastupdatetimestamp(lastUpdateTimestamp);
+
+ return Arrays.asList(r1, r2);
+ }
+
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
+ final Context context = new Context();
+ context.setId(id);
+ context.setDataInfo(Arrays.asList(dataInfo));
+ return Arrays.asList(context);
+ }
+
+ private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
+ final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
+ final String inferenceprovenance = rs.getString("inferenceprovenance");
+ final Boolean inferred = rs.getBoolean("inferred");
+ final String trust = rs.getString("trust");
+ return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
+ }
+
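+ // Parses a qualifier serialised as 'classid@@@classname@@@schemeid@@@schemename'.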
+ private Qualifier prepareQualifierSplitting(final String s) {
+ if (StringUtils.isBlank(s)) { return null; }
+ final String[] arr = s.split("@@@");
+ return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
+ }
+
+ private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
+ try {
+ return array != null ? listFields(info, (String[]) array.getArray()) : new ArrayList<>();
+ } catch (final SQLException e) {
+ throw new RuntimeException("Invalid SQL array", e);
+ }
+ }
+
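+ // Parses a structured property serialised as 'value###classid@@@classname@@@schemeid@@@schemename'.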
+ private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
+ if (StringUtils.isBlank(s)) { return null; }
+ final String[] parts = s.split("###");
+ if (parts.length == 2) {
+ final String value = parts[0];
+ final String[] arr = parts[1].split("@@@");
+ if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
+ }
+ return null;
+ }
+
+ private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
+ final List<StructuredProperty> res = new ArrayList<>();
+ if (array != null) {
+ for (final String s : (String[]) array.getArray()) {
+ final StructuredProperty sp = prepareStructProp(s, dataInfo);
+ if (sp != null) {
+ res.add(sp);
+ }
+ }
+ }
+
+ return res;
+ }
+
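+ // Builds a Journal from the 'issn@@@eissn@@@lissn' serialisation, keeping it only when at least one ISSN is present.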
+ private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
+ if (StringUtils.isNotBlank(sj)) {
+ final String[] arr = sj.split("@@@");
+ if (arr.length == 3) {
+ final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
+ final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;
+ final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;
+ if (issn != null || eissn != null
+ || lissn != null) { return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info); }
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ dbClient.close();
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java
new file mode 100644
index 0000000000..b1de31326b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java
@@ -0,0 +1,67 @@
+package eu.dnetlib.dhp.migration.step1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
+import eu.dnetlib.dhp.migration.utils.MdstoreClient;
+
+public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
+
+ private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
+
+ private final MdstoreClient mdstoreClient;
+
+ public static void main(final String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
+ parser.parseArgument(args);
+
+ final String mongoBaseUrl = parser.get("mongoBaseUrl");
+ final String mongoDb = parser.get("mongoDb");
+
+ final String mdFormat = parser.get("mdFormat");
+ final String mdLayout = parser.get("mdLayout");
+ final String mdInterpretation = parser.get("mdInterpretation");
+
+ final String hdfsPath = parser.get("hdfsPath");
+
+ try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
+ app.execute(mdFormat, mdLayout, mdInterpretation);
+ }
+
+ }
+
+ public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
+ super(hdfsPath);
+ this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
+ }
+
+ public void execute(final String format, final String layout, final String interpretation) {
+ final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
+ log.info("Found " + colls.size() + " mdstores");
+
+ for (final Entry<String, String> entry : colls.entrySet()) {
+ log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
+ final String currentColl = entry.getValue();
+
+ for (final String xml : mdstoreClient.listRecords(currentColl)) {
+ emit(xml, "native_" + format);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ mdstoreClient.close();
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java
new file mode 100644
index 0000000000..7c3000fbad
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java
@@ -0,0 +1,394 @@
+package eu.dnetlib.dhp.migration.step2;
+
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.dom4j.Document;
+import org.dom4j.DocumentFactory;
+import org.dom4j.DocumentHelper;
+import org.dom4j.Node;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.GeoLocation;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
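+/**
+ * Base class of the step 2 record mappers: parses a single mdstore XML record, builds the corresponding result
+ * entity (publication, dataset, software or other research product) according to //dr:CobjCategory/@type, and adds
+ * the project and result-to-result relations declared in the record.
+ */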
+public abstract class AbstractMdRecordToOafMapper {
+
+ protected final Map<String, String> code2name;
+
+ protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
+
+ protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
+ qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
+ protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
+ protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
+ protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
+
+ protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
+ this.code2name = code2name;
+ }
+
+ public List<Oaf> processMdRecord(final String xml) {
+ try {
+ final Map<String, String> nsContext = new HashMap<>();
+ nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
+ nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
+ nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
+ nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
+ nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
+ nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
+ nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
+ DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
+
+ final Document doc = DocumentHelper.parseText(xml);
+
+ final String type = doc.valueOf("//dr:CobjCategory/@type");
+ final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
+ final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
+ : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
+
+ final DataInfo info = prepareDataInfo(doc);
+ final long lastUpdateTimestamp = new Date().getTime();
+
+ return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ protected List<Oaf> createOafs(final Document doc,
+ final String type,
+ final KeyValue collectedFrom,
+ final KeyValue hostedBy,
+ final DataInfo info,
+ final long lastUpdateTimestamp) {
+
+ final List<Oaf> oafs = new ArrayList<>();
+
+ switch (type.toLowerCase()) {
+ case "":
+ case "publication":
+ final Publication p = new Publication();
+ populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+ p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
+ p.setJournal(prepareJournal(doc, info));
+ oafs.add(p);
+ break;
+ case "dataset":
+ final Dataset d = new Dataset();
+ populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+ d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
+ d.setStoragedate(prepareDatasetStorageDate(doc, info));
+ d.setDevice(prepareDatasetDevice(doc, info));
+ d.setSize(prepareDatasetSize(doc, info));
+ d.setVersion(prepareDatasetVersion(doc, info));
+ d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
+ d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
+ d.setGeolocation(prepareDatasetGeoLocations(doc, info));
+ oafs.add(d);
+ break;
+ case "software":
+ final Software s = new Software();
+ populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+ s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
+ s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
+ s.setLicense(prepareSoftwareLicenses(doc, info));
+ s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
+ s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
+ oafs.add(s);
+ break;
+ case "otherresearchproducts":
+ default:
+ final OtherResearchProduct o = new OtherResearchProduct();
+ populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+ o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
+ o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
+ o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
+ o.setTool(prepareOtherResearchProductTools(doc, info));
+ oafs.add(o);
+ break;
+ }
+
+ if (!oafs.isEmpty()) {
+ oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
+ oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
+ }
+
+ return oafs;
+ }
+
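+ /** Creates the two symmetric resultProject/outcome relations for each //oaf:projectid element of the record. */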
+ private List<Oaf> addProjectRels(final Document doc,
+ final KeyValue collectedFrom,
+ final DataInfo info,
+ final long lastUpdateTimestamp) {
+
+ final List<Oaf> res = new ArrayList<>();
+
+ final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+
+ for (final Object o : doc.selectNodes("//oaf:projectid")) {
+ final String projectId = createOpenaireId(40, ((Node) o).getText(), true);
+
+ final Relation r1 = new Relation();
+ r1.setRelType("resultProject");
+ r1.setSubRelType("outcome");
+ r1.setRelClass("isProducedBy");
+ r1.setSource(docId);
+ r1.setTarget(projectId);
+ r1.setCollectedFrom(Arrays.asList(collectedFrom));
+ r1.setDataInfo(info);
+ r1.setLastupdatetimestamp(lastUpdateTimestamp);
+ res.add(r1);
+
+ final Relation r2 = new Relation();
+ r2.setRelType("resultProject");
+ r2.setSubRelType("outcome");
+ r2.setRelClass("produces");
+ r2.setSource(projectId);
+ r2.setTarget(docId);
+ r2.setCollectedFrom(Arrays.asList(collectedFrom));
+ r2.setDataInfo(info);
+ r2.setLastupdatetimestamp(lastUpdateTimestamp);
+ res.add(r2);
+ }
+
+ return res;
+ }
+
+ protected abstract List<Oaf> addOtherResultRels(final Document doc,
+ final KeyValue collectedFrom,
+ final DataInfo info,
+ final long lastUpdateTimestamp);
+
+ private void populateResultFields(final Result r,
+ final Document doc,
+ final KeyValue collectedFrom,
+ final KeyValue hostedBy,
+ final DataInfo info,
+ final long lastUpdateTimestamp) {
+ r.setDataInfo(info);
+ r.setLastupdatetimestamp(lastUpdateTimestamp);
+ r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
+ r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
+ r.setCollectedfrom(Arrays.asList(collectedFrom));
+ r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
+ r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
+ r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
+ r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+ r.setOaiprovenance(prepareOAIprovenance(doc));
+ r.setAuthor(prepareAuthors(doc, info));
+ r.setLanguage(prepareLanguages(doc));
+ r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+ r.setSubject(prepareSubjects(doc, info));
+ r.setTitle(prepareTitles(doc, info));
+ r.setRelevantdate(prepareRelevantDates(doc, info));
+ r.setDescription(prepareDescriptions(doc, info));
+ r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
+ r.setPublisher(preparePublisher(doc, info));
+ r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
+ r.setSource(prepareSources(doc, info));
+ r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+ r.setFormat(prepareFormats(doc, info));
+ r.setContributor(prepareContributors(doc, info));
+ r.setResourcetype(prepareResourceType(doc, info));
+ r.setCoverage(prepareCoverages(doc, info));
+ r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+ r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+ r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
+ }
+
+ protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
+
+ protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
+
+ protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
+
+ protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
+
+ protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
+
+ protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
+
+ protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
+
+ protected abstract Qualifier prepareLanguages(Document doc);
+
+ protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
+
+ protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
+
+ protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
+
+ protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
+
+ protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
+
+ protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
+
+ private Journal prepareJournal(final Document doc, final DataInfo info) {
+ final Node n = doc.selectSingleNode("//oaf:journal");
+ if (n != null) {
+ final String name = n.getText();
+ final String issnPrinted = n.valueOf("@issn");
+ final String issnOnline = n.valueOf("@eissn");
+ final String issnLinking = n.valueOf("@lissn");
+ final String ep = n.valueOf("@ep");
+ final String iss = n.valueOf("@iss");
+ final String sp = n.valueOf("@sp");
+ final String vol = n.valueOf("@vol");
+ final String edition = n.valueOf("@edition");
+ if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
+ }
+ return null;
+ }
+
+ protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
+ final String classId = node.valueOf(xpath);
+ final String className = code2name.get(classId);
+ return qualifier(classId, className, schemeId, schemeName);
+ }
+
+ protected List<StructuredProperty> prepareListStructProps(final Node node,
+ final String xpath,
+ final String xpathClassId,
+ final String schemeId,
+ final String schemeName,
+ final DataInfo info) {
+ final List<StructuredProperty> res = new ArrayList<>();
+ for (final Object o : node.selectNodes(xpath)) {
+ final Node n = (Node) o;
+ final String classId = n.valueOf(xpathClassId);
+ final String className = code2name.get(classId);
+ res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
+ }
+ return res;
+ }
+
+ protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
+ final List<StructuredProperty> res = new ArrayList<>();
+ for (final Object o : node.selectNodes(xpath)) {
+ final Node n = (Node) o;
+ res.add(structuredProperty(n.getText(), qualifier, info));
+ }
+ return res;
+ }
+
+ protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
+ final List<StructuredProperty> res = new ArrayList<>();
+ for (final Object o : node.selectNodes(xpath)) {
+ final Node n = (Node) o;
+ res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info));
+ }
+ return res;
+ }
+
+ protected OAIProvenance prepareOAIprovenance(final Document doc) {
+ final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
+
+ if (n == null) { return null; }
+
+ final String identifier = n.valueOf("./*[local-name()='identifier']");
+ final String baseURL = n.valueOf("./*[local-name()='baseURL']");
+ final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
+ final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
+ final String datestamp = n.valueOf("./*[local-name()='datestamp']");
+ final String harvestDate = n.valueOf("@harvestDate");
+
+ return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
+
+ }
+
+ protected DataInfo prepareDataInfo(final Document doc) {
+ final Node n = doc.selectSingleNode("//oaf:datainfo");
+
+ if (n == null) { return null; }
+
+ final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
+ final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
+ final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
+ final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
+
+ final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
+ final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
+ final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
+ final String trust = n.valueOf("./oaf:trust");
+
+ return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
+ }
+
+ protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
+ return field(node.valueOf(xpath), info);
+ }
+
+ protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
+ return listFields(info, prepareListString(node, xpath));
+ }
+
+ protected List<String> prepareListString(final Node node, final String xpath) {
+ final List<String> res = new ArrayList<>();
+ for (final Object o : node.selectNodes(xpath)) {
+ final String s = ((Node) o).getText().trim();
+ if (StringUtils.isNotBlank(s)) {
+ res.add(s);
+ }
+ }
+ return res;
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java
new file mode 100644
index 0000000000..7f907b0c8c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java
@@ -0,0 +1,173 @@
+package eu.dnetlib.dhp.migration.step2;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
+import eu.dnetlib.dhp.migration.utils.DbClient;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Software;
+import scala.Tuple2;
+
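+/**
+ * Step 2: reads the sequence files produced by step 1, maps each record to a list of Oaf objects
+ * (via OafToOafMapper/OdfToOafMapper or plain JSON deserialization, depending on the record type)
+ * and saves them as gzipped text lines in the form "classname|json".
+ */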
+public class GenerateEntitiesApplication {
+
+ private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
+
+ public static void main(final String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateMongoMdstoresApplication.class
+ .getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
+
+ parser.parseArgument(args);
+
+ final String sourcePaths = parser.get("sourcePaths");
+ final String targetPath = parser.get("targetPath");
+
+ final String dbUrl = parser.get("postgresUrl");
+ final String dbUser = parser.get("postgresUser");
+ final String dbPassword = parser.get("postgresPassword");
+
+ final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
+
+ try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
+ final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
+ generateEntities(sc, code2name, existingSourcePaths, targetPath);
+ }
+ }
+
+ private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
+ return SparkSession
+ .builder()
+ .appName(GenerateEntitiesApplication.class.getSimpleName())
+ .master(parser.get("master"))
+ .getOrCreate();
+ }
+
+ private static void generateEntities(final JavaSparkContext sc,
+ final Map<String, String> code2name,
+ final List<String> sourcePaths,
+ final String targetPath) {
+
+ log.info("Generate entities from files:");
+ sourcePaths.forEach(log::info);
+
+ JavaRDD<String> inputRdd = sc.emptyRDD();
+
+ for (final String sp : sourcePaths) {
+ inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
+ .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
+ .map(k -> convertToListOaf(k._1(), k._2(), code2name))
+ .flatMap(list -> list.iterator())
+ .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
+ }
+
+ inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
+
+ }
+
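+ /** The record type is encoded in the sequence file key after the ':' separator (see AbstractMigrationApplication.emit). */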
+ private static List<Oaf> convertToListOaf(final String id, final String s, final Map<String, String> code2name) {
+ final String type = StringUtils.substringAfter(id, ":");
+
+ switch (type.toLowerCase()) {
+ case "native_oaf":
+ return new OafToOafMapper(code2name).processMdRecord(s);
+ case "native_odf":
+ return new OdfToOafMapper(code2name).processMdRecord(s);
+ case "datasource":
+ return Arrays.asList(convertFromJson(s, Datasource.class));
+ case "organization":
+ return Arrays.asList(convertFromJson(s, Organization.class));
+ case "project":
+ return Arrays.asList(convertFromJson(s, Project.class));
+ case "relation":
+ return Arrays.asList(convertFromJson(s, Relation.class));
+ case "publication":
+ return Arrays.asList(convertFromJson(s, Publication.class));
+ case "dataset":
+ return Arrays.asList(convertFromJson(s, Dataset.class));
+ case "software":
+ return Arrays.asList(convertFromJson(s, Software.class));
+ case "otherresearchproducts":
+ default:
+ return Arrays.asList(convertFromJson(s, OtherResearchProduct.class));
+ }
+
+ }
+
+ private static Map<String, String> loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
+
+ log.info("Loading vocabulary terms from db...");
+
+ final Map<String, String> map = new HashMap<>();
+
+ try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
+ dbClient.processResults("select code, name from class", rs -> {
+ try {
+ map.put(rs.getString("code"), rs.getString("name"));
+ } catch (final SQLException e) {
+ e.printStackTrace();
+ }
+ });
+ }
+
+ log.info("Found " + map.size() + " terms.");
+
+ return map;
+
+ }
+
+ private static String convertToJson(final Oaf oaf) {
+ try {
+ return new ObjectMapper().writeValueAsString(oaf);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
+ try {
+ return new ObjectMapper().readValue(s, clazz);
+ } catch (final Exception e) {
+ log.error("Error parsing object of class: " + clazz);
+ log.error(s);
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static boolean exists(final JavaSparkContext context, final String pathToFile) {
+ try {
+ final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
+ final Path path = new Path(pathToFile);
+ return hdfs.exists(path);
+ } catch (final IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java
new file mode 100644
index 0000000000..110abc4864
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java
@@ -0,0 +1,242 @@
+package eu.dnetlib.dhp.migration.step2;
+
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.dom4j.Document;
+import org.dom4j.Node;
+
+import eu.dnetlib.dhp.migration.utils.PacePerson;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.GeoLocation;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
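+/** Maps records of the native "oaf" format (dc:* based XPaths) to the OpenAIRE graph model. */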
+public class OafToOafMapper extends AbstractMdRecordToOafMapper {
+
+ public OafToOafMapper(final Map<String, String> code2name) {
+ super(code2name);
+ }
+
+ @Override
+ protected List prepareAuthors(final Document doc, final DataInfo info) {
+ final List res = new ArrayList<>();
+ int pos = 1;
+ for (final Object o : doc.selectNodes("//dc:creator")) {
+ final Node n = (Node) o;
+ final Author author = new Author();
+ author.setFullname(n.getText());
+ author.setRank(pos++);
+ final PacePerson p = new PacePerson(n.getText(), false);
+ if (p.isAccurate()) {
+ author.setName(p.getNormalisedFirstName());
+ author.setSurname(p.getNormalisedSurname());
+ }
+ res.add(author);
+ }
+ return res;
+ }
+
+ @Override
+ protected Qualifier prepareLanguages(final Document doc) {
+ return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
+ }
+
+ @Override
+ protected List prepareSubjects(final Document doc, final DataInfo info) {
+ return prepareListStructProps(doc, "//dc:subject", info);
+ }
+
+ @Override
+ protected List prepareTitles(final Document doc, final DataInfo info) {
+ return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//dc:description", info);
+ }
+
+ @Override
+ protected Field preparePublisher(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//dc:publisher", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//dc:format", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//dc:contributor", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//dc:coverage", info);
+ }
+
+ @Override
+ protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
+ final List res = new ArrayList<>();
+ for (final Object o : doc.selectNodes("//dc:identifier")) {
+ final String url = ((Node) o).getText().trim();
+ if (url.startsWith("http")) {
+ final Instance instance = new Instance();
+ instance.setUrl(Arrays.asList(url));
+ instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
+ instance.setCollectedfrom(collectedfrom);
+ instance.setHostedby(hostedby);
+ instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
+ instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
+ instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
+ instance.setLicense(field(doc.valueOf("//oaf:license"), info));
+ instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
+ instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
+ instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
+ res.add(instance);
+ }
+ }
+ return res;
+ }
+
+ @Override
+ protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//dc:source", info);
+ }
+
+ @Override
+ protected List prepareRelevantDates(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ // SOFTWARES
+
+ @Override
+ protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ // DATASETS
+ @Override
+ protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetVersion(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetSize(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetDevice(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+ // OTHER PRODUCTS
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // NOT PRESENT IN OAF
+ }
+
+ @Override
+ protected List addOtherResultRels(final Document doc,
+ final KeyValue collectedFrom,
+ final DataInfo info,
+ final long lastUpdateTimestamp) {
+ final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+
+ final List res = new ArrayList<>();
+
+ for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
+ final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
+
+ final Relation r1 = new Relation();
+ r1.setRelType("resultResult");
+ r1.setSubRelType("publicationDataset");
+ r1.setRelClass("isRelatedTo");
+ r1.setSource(docId);
+ r1.setTarget(otherId);
+ r1.setCollectedFrom(Arrays.asList(collectedFrom));
+ r1.setDataInfo(info);
+ r1.setLastupdatetimestamp(lastUpdateTimestamp);
+ res.add(r1);
+
+ final Relation r2 = new Relation();
+ r2.setRelType("resultResult");
+ r2.setSubRelType("publicationDataset");
+ r2.setRelClass("isRelatedTo");
+ r2.setSource(otherId);
+ r2.setTarget(docId);
+ r2.setCollectedFrom(Arrays.asList(collectedFrom));
+ r2.setDataInfo(info);
+ r2.setLastupdatetimestamp(lastUpdateTimestamp);
+ res.add(r2);
+ }
+ return res;
+ }
+
+ @Override
+ protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
+ return null; // NOT PRESENT IN OAF
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java
new file mode 100644
index 0000000000..b4868b8f9b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java
@@ -0,0 +1,265 @@
+package eu.dnetlib.dhp.migration.step2;
+
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
+import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.dom4j.Document;
+import org.dom4j.Node;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.GeoLocation;
+import eu.dnetlib.dhp.schema.oaf.Instance;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
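+/** Maps records of the native "odf" format (DataCite based XPaths) to the OpenAIRE graph model. */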
+public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
+
+ public OdfToOafMapper(final Map<String, String> code2name) {
+ super(code2name);
+ }
+
+ @Override
+ protected List prepareTitles(final Document doc, final DataInfo info) {
+ return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
+ }
+
+ @Override
+ protected List prepareAuthors(final Document doc, final DataInfo info) {
+ final List res = new ArrayList<>();
+ int pos = 1;
+ for (final Object o : doc.selectNodes("//datacite:creator")) {
+ final Node n = (Node) o;
+ final Author author = new Author();
+ author.setFullname(n.valueOf("./datacite:creatorName"));
+ author.setName(n.valueOf("./datacite:givenName"));
+ author.setSurname(n.valueOf("./datacite:familyName"));
+ author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
+ author.setPid(preparePids(n, info));
+ author.setRank(pos++);
+ res.add(author);
+ }
+ return res;
+ }
+
+ private List<StructuredProperty> preparePids(final Node n, final DataInfo info) {
+ final List<StructuredProperty> res = new ArrayList<>();
+ for (final Object o : n.selectNodes("./datacite:nameIdentifier")) {
+ res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
+ }
+ return res;
+ }
+
+ @Override
+ protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
+ final List res = new ArrayList<>();
+ for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
+ final Instance instance = new Instance();
+ instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
+ instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
+ instance.setCollectedfrom(collectedfrom);
+ instance.setHostedby(hostedby);
+ instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
+ instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
+ instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
+ instance.setLicense(field(doc.valueOf("//oaf:license"), info));
+ instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
+ instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
+ instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
+ res.add(instance);
+ }
+ return res;
+ }
+
+ @Override
+ protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // Not present in ODF ???
+ }
+
+ @Override
+ protected List prepareRelevantDates(final Document doc, final DataInfo info) {
+ final List res = new ArrayList<>();
+ for (final Object o : doc.selectNodes("//datacite:date")) {
+ final String dateType = ((Node) o).valueOf("@dateType");
+ if (StringUtils.isBlank(dateType) || (!dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
+ && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available"))) {
+ res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
+ }
+ }
+ return res;
+ }
+
+ @Override
+ protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // Not present in ODF ???
+ }
+
+ @Override
+ protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:contributorName", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:format", info);
+ }
+
+ @Override
+ protected Field preparePublisher(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//datacite:publisher", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
+ }
+
+ @Override
+ protected List prepareSubjects(final Document doc, final DataInfo info) {
+ return prepareListStructProps(doc, "//datacite:subject", info);
+ }
+
+ @Override
+ protected Qualifier prepareLanguages(final Document doc) {
+ return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages");
+ }
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // Not present in ODF ???
+ }
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
+ }
+
+ @Override
+ protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
+ }
+
+ @Override
+ protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
+ return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
+ }
+
+ @Override
+ protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
+ return null; // Not present in ODF ???
+ }
+
+ @Override
+ protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) {
+ return new ArrayList<>(); // Not present in ODF ???
+ }
+
+ @Override
+ protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
+ return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
+ }
+
+ // DATASETS
+
+ @Override
+ protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
+ final List res = new ArrayList<>();
+
+ for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
+ final GeoLocation loc = new GeoLocation();
+ loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox"));
+ loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace"));
+ loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint"));
+ res.add(loc);
+ }
+ return res;
+ }
+
+ @Override
+ protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
+ return null; // Not present in ODF ???
+ }
+
+ @Override
+ protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
+ }
+
+ @Override
+ protected Field prepareDatasetVersion(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//datacite:version", info);
+ }
+
+ @Override
+ protected Field prepareDatasetSize(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//datacite:size", info);
+ }
+
+ @Override
+ protected Field prepareDatasetDevice(final Document doc, final DataInfo info) {
+ return null; // Not present in ODF ???
+ }
+
+ @Override
+ protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) {
+ return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
+ }
+
+ @Override
+ protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
+
+ final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+
+ final List res = new ArrayList<>();
+
+ for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
+ final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
+ final String type = ((Node) o).valueOf("@relationType");
+
+ if (type.equals("IsSupplementTo")) {
+ res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
+ res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
+ } else if (type.equals("IsPartOf")) {
+ res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
+ res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
+ }
+ }
+ return res;
+ }
+
+ private Relation prepareOtherResultRel(final KeyValue collectedFrom,
+ final DataInfo info,
+ final long lastUpdateTimestamp,
+ final String source,
+ final String target,
+ final String subRelType,
+ final String relClass) {
+ final Relation r = new Relation();
+ r.setRelType("resultResult");
+ r.setSubRelType(subRelType);
+ r.setRelClass(relClass);
+ r.setSource(source);
+ r.setTarget(target);
+ r.setCollectedFrom(Arrays.asList(collectedFrom));
+ r.setDataInfo(info);
+ r.setLastupdatetimestamp(lastUpdateTimestamp);
+ return r;
+ }
+
+ @Override
+ protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
+ return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java
new file mode 100644
index 0000000000..4ee24cba0a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java
@@ -0,0 +1,71 @@
+package eu.dnetlib.dhp.migration.step3;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.Datasource;
+import eu.dnetlib.dhp.schema.oaf.Organization;
+import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
+import eu.dnetlib.dhp.schema.oaf.Project;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.Software;
+
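+/**
+ * Step 3: reads the "type|json" text lines produced by step 2 and dispatches each record
+ * into a separate gzipped folder per entity type under the graph raw path.
+ */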
+public class DispatchEntitiesApplication {
+
+ private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
+
+ public static void main(final String[] args) throws Exception {
+ final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+ IOUtils.toString(MigrateMongoMdstoresApplication.class
+ .getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
+ parser.parseArgument(args);
+
+ try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
+
+ final String sourcePath = parser.get("sourcePath");
+ final String targetPath = parser.get("graphRawPath");
+
+ processEntity(sc, Publication.class, sourcePath, targetPath);
+ processEntity(sc, Dataset.class, sourcePath, targetPath);
+ processEntity(sc, Software.class, sourcePath, targetPath);
+ processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
+ processEntity(sc, Datasource.class, sourcePath, targetPath);
+ processEntity(sc, Organization.class, sourcePath, targetPath);
+ processEntity(sc, Project.class, sourcePath, targetPath);
+ processEntity(sc, Relation.class, sourcePath, targetPath);
+ }
+ }
+
+ private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
+ return SparkSession
+ .builder()
+ .appName(DispatchEntitiesApplication.class.getSimpleName())
+ .master(parser.get("master"))
+ .getOrCreate();
+ }
+
+ private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
+ final String type = clazz.getSimpleName().toLowerCase();
+
+ log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
+
+ sc.textFile(sourcePath)
+ .filter(l -> isEntityType(l, type))
+ .map(l -> StringUtils.substringAfter(l, "|"))
+ .saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
+ }
+
+ private static boolean isEntityType(final String line, final String type) {
+ return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java
new file mode 100644
index 0000000000..e1a5e5fa79
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java
@@ -0,0 +1,81 @@
+package eu.dnetlib.dhp.migration.utils;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.codehaus.jackson.map.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
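+/**
+ * Base class of the migration applications: wraps an HDFS SequenceFile writer and emits (key, value) pairs
+ * where the key is a progressive counter plus the record type and the value is the record itself
+ * (raw XML or JSON-serialized Oaf).
+ */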
+public class AbstractMigrationApplication implements Closeable {
+
+ private final AtomicInteger counter = new AtomicInteger(0);
+
+ private final Text key = new Text();
+
+ private final Text value = new Text();
+
+ private final SequenceFile.Writer writer;
+
+ private final ObjectMapper objectMapper = new ObjectMapper();
+
+ private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
+
+ protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST
+ this.writer = null;
+ }
+
+ public AbstractMigrationApplication(final String hdfsPath) throws Exception {
+
+ log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));
+
+ this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
+ .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
+ }
+
+ private Configuration getConf() throws IOException {
+ final Configuration conf = new Configuration();
+ /*
+ * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+ * conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser);
+ * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf);
+ */
+ return conf;
+ }
+
+ protected void emit(final String s, final String type) {
+ try {
+ key.set(counter.getAndIncrement() + ":" + type);
+ value.set(s);
+ writer.append(key, value);
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ protected void emitOaf(final Oaf oaf) {
+ try {
+ emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase());
+ } catch (final Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public ObjectMapper getObjectMapper() {
+ return objectMapper;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (writer != null) {
+ writer.hflush();
+ writer.close();
+ }
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java
new file mode 100644
index 0000000000..8e97843464
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java
@@ -0,0 +1,65 @@
+package eu.dnetlib.dhp.migration.utils;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.function.Consumer;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
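+/** Thin PostgreSQL client: opens a connection and streams the rows of a query to a ResultSet consumer. */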
+public class DbClient implements Closeable {
+
+ private static final Log log = LogFactory.getLog(DbClient.class);
+
+ private Connection connection;
+
+ public DbClient(final String address, final String login, final String password) {
+
+ try {
+ Class.forName("org.postgresql.Driver");
+
+ this.connection =
+ StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
+ this.connection.setAutoCommit(false);
+ } catch (final Exception e) {
+ log.error("Connection to postgresDB failed");
+ throw new RuntimeException("Connection to postgresDB failed", e);
+ }
+ log.info("Opened database successfully");
+ }
+
+ public void processResults(final String sql, final Consumer<ResultSet> consumer) {
+
+ try (final Statement stmt = connection.createStatement()) {
+ stmt.setFetchSize(100);
+
+ try (final ResultSet rs = stmt.executeQuery(sql)) {
+ while (rs.next()) {
+ consumer.accept(rs);
+ }
+ } catch (final SQLException e) {
+ log.error("Error executing sql query: " + sql, e);
+ throw new RuntimeException("Error executing sql query", e);
+ }
+ } catch (final SQLException e1) {
+ log.error("Error preparing sql statement", e1);
+ throw new RuntimeException("Error preparing sql statement", e1);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ connection.close();
+ } catch (final SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java
new file mode 100644
index 0000000000..612503da74
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java
@@ -0,0 +1,94 @@
+package eu.dnetlib.dhp.migration.utils;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.bson.Document;
+
+import com.google.common.collect.Iterables;
+import com.mongodb.MongoClient;
+import com.mongodb.MongoClientURI;
+import com.mongodb.client.MongoCollection;
+import com.mongodb.client.MongoDatabase;
+
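+/**
+ * MongoDB client for the mdstore database: resolves the current transaction collection of each mdstore
+ * matching the requested format/layout/interpretation and streams the record bodies it contains.
+ */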
+public class MdstoreClient implements Closeable {
+
+ private final MongoClient client;
+ private final MongoDatabase db;
+
+ private static final String COLL_METADATA = "metadata";
+ private static final String COLL_METADATA_MANAGER = "metadataManager";
+
+ private static final Log log = LogFactory.getLog(MdstoreClient.class);
+
+ public MdstoreClient(final String baseUrl, final String dbName) {
+ this.client = new MongoClient(new MongoClientURI(baseUrl));
+ this.db = getDb(client, dbName);
+ }
+
+ public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
+
+ final Map<String, String> transactions = new HashMap<>();
+ for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
+ final String mdId = entry.getString("mdId");
+ final String currentId = entry.getString("currentId");
+ if (StringUtils.isNoneBlank(mdId, currentId)) {
+ transactions.put(mdId, currentId);
+ }
+ }
+
+ final Map<String, String> res = new HashMap<>();
+ for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
+ if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
+ && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
+ res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
+ }
+ }
+
+ return res;
+ }
+
+ private MongoDatabase getDb(final MongoClient client, final String dbName) {
+ if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
+ final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
+ log.warn(err);
+ throw new RuntimeException(err);
+ }
+ return client.getDatabase(dbName);
+ }
+
+ private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
+ if (!Iterables.contains(db.listCollectionNames(), collName)) {
+ final String err = String.format("Missing collection '%s' in database '%s'", collName, db.getName());
+ log.warn(err);
+ if (abortIfMissing) {
+ throw new RuntimeException(err);
+ } else {
+ return null;
+ }
+ }
+ return db.getCollection(collName);
+ }
+
+ public Iterable<String> listRecords(final String collName) {
+ final MongoCollection<Document> coll = getColl(db, collName, false);
+ return coll == null ? new ArrayList<>()
+ : () -> StreamSupport.stream(coll.find().spliterator(), false)
+ .filter(e -> e.containsKey("body"))
+ .map(e -> e.getString("body"))
+ .iterator();
+ }
+
+ @Override
+ public void close() throws IOException {
+ client.close();
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java
new file mode 100644
index 0000000000..8e51c1858e
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java
@@ -0,0 +1,195 @@
+package eu.dnetlib.dhp.migration.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
+import eu.dnetlib.dhp.schema.oaf.Field;
+import eu.dnetlib.dhp.schema.oaf.Journal;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
+import eu.dnetlib.dhp.schema.oaf.OriginDescription;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
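+/** Factory helpers to build the OAF schema objects (KeyValue, Field, Qualifier, StructuredProperty, Journal, DataInfo, ...) used by the mappers. */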
+public class OafMapperUtils {
+
+ public static KeyValue keyValue(final String k, final String v) {
+ final KeyValue kv = new KeyValue();
+ kv.setKey(k);
+ kv.setValue(v);
+ return kv;
+ }
+
+ public static List<KeyValue> listKeyValues(final String... s) {
+ if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
+
+ final List<KeyValue> list = new ArrayList<>();
+ for (int i = 0; i < s.length; i += 2) {
+ list.add(keyValue(s[i], s[i + 1]));
+ }
+ return list;
+ }
+
+ public static <T> Field<T> field(final T value, final DataInfo info) {
+ if (value == null || StringUtils.isBlank(value.toString())) { return null; }
+
+ final Field<T> field = new Field<>();
+ field.setValue(value);
+ field.setDataInfo(info);
+ return field;
+ }
+
+ public static List<Field<String>> listFields(final DataInfo info, final String... values) {
+ return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
+ }
+
+ public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
+ return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
+ }
+
+ public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
+ final Qualifier q = new Qualifier();
+ q.setClassid(classid);
+ q.setClassname(classname);
+ q.setSchemeid(schemeid);
+ q.setSchemename(schemename);
+ return q;
+ }
+
+ public static StructuredProperty structuredProperty(final String value,
+ final String classid,
+ final String classname,
+ final String schemeid,
+ final String schemename,
+ final DataInfo dataInfo) {
+
+ return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
+ }
+
+ public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
+ if (value == null) { return null; }
+ final StructuredProperty sp = new StructuredProperty();
+ sp.setValue(value);
+ sp.setQualifier(qualifier);
+ sp.setDataInfo(dataInfo);
+ return sp;
+ }
+
+ public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
+ final ExtraInfo info = new ExtraInfo();
+ info.setName(name);
+ info.setValue(value);
+ info.setTypology(typology);
+ info.setProvenance(provenance);
+ info.setTrust(trust);
+ return info;
+ }
+
+ public static OAIProvenance oaiIProvenance(final String identifier,
+ final String baseURL,
+ final String metadataNamespace,
+ final Boolean altered,
+ final String datestamp,
+ final String harvestDate) {
+
+ final OriginDescription desc = new OriginDescription();
+ desc.setIdentifier(identifier);
+ desc.setBaseURL(baseURL);
+ desc.setMetadataNamespace(metadataNamespace);
+ desc.setAltered(altered);
+ desc.setDatestamp(datestamp);
+ desc.setHarvestDate(harvestDate);
+
+ final OAIProvenance p = new OAIProvenance();
+ p.setOriginDescription(desc);
+
+ return p;
+ }
+
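+ /**
+ * Builds a Journal bean only when at least one of name, issnPrinted, issnOnline or issnLinking
+ * is not blank; returns null otherwise.
+ */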
+ public static Journal journal(final String name,
+ final String issnPrinted,
+ final String issnOnline,
+ final String issnLinking,
+ final String ep,
+ final String iss,
+ final String sp,
+ final String vol,
+ final String edition,
+ final String conferenceplace,
+ final String conferencedate,
+ final DataInfo dataInfo) {
+
+ if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
+ final Journal j = new Journal();
+ j.setName(name);
+ j.setIssnPrinted(issnPrinted);
+ j.setIssnOnline(issnOnline);
+ j.setIssnLinking(issnLinking);
+ j.setEp(ep);
+ j.setIss(iss);
+ j.setSp(sp);
+ j.setVol(vol);
+ j.setEdition(edition);
+ j.setConferenceplace(conferenceplace);
+ j.setConferencedate(conferencedate);
+ j.setDataInfo(dataInfo);
+ return j;
+ } else {
+ return null;
+ }
+ }
+
+ public static DataInfo dataInfo(final Boolean deletedbyinference,
+ final String inferenceprovenance,
+ final Boolean inferred,
+ final Boolean invisible,
+ final Qualifier provenanceaction,
+ final String trust) {
+ final DataInfo d = new DataInfo();
+ d.setDeletedbyinference(deletedbyinference);
+ d.setInferenceprovenance(inferenceprovenance);
+ d.setInferred(inferred);
+ d.setInvisible(invisible);
+ d.setProvenanceaction(provenanceaction);
+ d.setTrust(trust);
+ return d;
+ }
+
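+ /**
+ * Builds an OpenAIRE identifier: when to_md5 is true the result is "prefix|nsPrefix::md5(rest)",
+ * where nsPrefix and rest are the parts of the original id before and after the "::" separator;
+ * otherwise the original id is kept as "prefix|originalId".
+ */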
+ public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) {
+ if (to_md5) {
+ final String nsPrefix = StringUtils.substringBefore(originalId, "::");
+ final String rest = StringUtils.substringAfter(originalId, "::");
+ return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
+ } else {
+ return String.format("%s|%s", prefix, originalId);
+ }
+ }
+
+ public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) {
+ switch (type) {
+ case "datasource":
+ return createOpenaireId(10, originalId, to_md5);
+ case "organization":
+ return createOpenaireId(20, originalId, to_md5);
+ case "person":
+ return createOpenaireId(30, originalId, to_md5);
+ case "project":
+ return createOpenaireId(40, originalId, to_md5);
+ default:
+ return createOpenaireId(50, originalId, to_md5);
+ }
+ }
+
+ public static String asString(final Object o) {
+ return o == null ? "" : o.toString();
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java
new file mode 100644
index 0000000000..69e128e63c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java
@@ -0,0 +1,176 @@
+package eu.dnetlib.dhp.migration.utils;
+
+import java.nio.charset.Charset;
+import java.text.Normalizer;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.text.WordUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
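+/**
+ * Parses a person name into name, surname and fullname token lists, normalising punctuation,
+ * diacritics and well-known name particles (van, der, de, ...).
+ */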
+public class PacePerson {
+
+ private static final String UTF8 = "UTF-8";
+ private List<String> name = Lists.newArrayList();
+ private List<String> surname = Lists.newArrayList();
+ private List<String> fullname = Lists.newArrayList();
+ private final String original;
+
+ private static Set<String> particles = null;
+
+ public static final String capitalize(final String s) {
+ return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
+ }
+
+ public static final String dotAbbreviations(final String s) {
+ return s.length() == 1 ? s + "." : s;
+ }
+
+ public static Set<String> loadFromClasspath(final String classpath) {
+ final Set<String> h = new HashSet<>();
+ try {
+ for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
+ h.add(s);
+ }
+ } catch (final Throwable e) {
+ return new HashSet<>();
+ }
+ return h;
+ }
+
+ public PacePerson(String s, final boolean aggressive) {
+ original = s;
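+ // normalise to NFD, then strip bracketed parts, punctuation (except comma and hyphen), digits, dots and redundant whitespace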
+ s = Normalizer.normalize(s, Normalizer.Form.NFD);
+ s = s.replaceAll("\\(.+\\)", "");
+ s = s.replaceAll("\\[.+\\]", "");
+ s = s.replaceAll("\\{.+\\}", "");
+ s = s.replaceAll("\\s+-\\s+", "-");
+ s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
+ s = s.replaceAll("\\d", " ");
+ s = s.replaceAll("\\n", " ");
+ s = s.replaceAll("\\.", " ");
+ s = s.replaceAll("\\s+", " ");
+
+ if (aggressive) {
+ s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
+ // s = s.replaceAll("[\\W&&[^,-]]", "");
+ }
+
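+ // a comma means the "Surname, Name" convention; otherwise name/surname are guessed from initials and upper-case terms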
+ if (s.contains(",")) {
+ final String[] arr = s.split(",");
+ if (arr.length == 1) {
+ fullname = splitTerms(arr[0]);
+ } else if (arr.length > 1) {
+ surname = splitTerms(arr[0]);
+ name = splitTerms(arr[1]);
+ fullname.addAll(surname);
+ fullname.addAll(name);
+ }
+ } else {
+ fullname = splitTerms(s);
+
+ int lastInitialPosition = fullname.size();
+ boolean hasSurnameInUpperCase = false;
+
+ for (int i = 0; i < fullname.size(); i++) {
+ final String term = fullname.get(i);
+ if (term.length() == 1) {
+ lastInitialPosition = i;
+ } else if (term.equals(term.toUpperCase())) {
+ hasSurnameInUpperCase = true;
+ }
+ }
+
+ if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
+ name = fullname.subList(0, lastInitialPosition + 1);
+ surname = fullname.subList(lastInitialPosition + 1, fullname.size());
+ } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+ for (final String term : fullname) {
+ if (term.length() > 1 && term.equals(term.toUpperCase())) {
+ surname.add(term);
+ } else {
+ name.add(term);
+ }
+ }
+ }
+ }
+ }
+
+ private List<String> splitTerms(final String s) {
+ if (particles == null) {
+ particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
+ }
+
+ final List<String> list = Lists.newArrayList();
+ for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+ if (!particles.contains(part.toLowerCase())) {
+ list.add(part);
+ }
+ }
+ return list;
+ }
+
+ public List<String> getName() {
+ return name;
+ }
+
+ public String getNameString() {
+ return Joiner.on(" ").join(getName());
+ }
+
+ public List<String> getSurname() {
+ return surname;
+ }
+
+ public List<String> getFullname() {
+ return fullname;
+ }
+
+ public String getOriginal() {
+ return original;
+ }
+
+ public String hash() {
+ return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
+ }
+
+ public String getNormalisedFirstName() {
+ return Joiner.on(" ").join(getCapitalFirstnames());
+ }
+
+ public String getNormalisedSurname() {
+ return Joiner.on(" ").join(getCapitalSurname());
+ }
+
+ public String getSurnameString() {
+ return Joiner.on(" ").join(getSurname());
+ }
+
+ public String getNormalisedFullname() {
+ return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
+ }
+
+ public List<String> getCapitalFirstnames() {
+ return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
+ }
+
+ public List<String> getCapitalSurname() {
+ return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
+ }
+
+ public List<String> getNameWithAbbreviations() {
+ return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
+ }
+
+ public boolean isAccurate() {
+ return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json
new file mode 100644
index 0000000000..8c81290ca2
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json
@@ -0,0 +1,20 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePath",
+ "paramDescription": "the source path",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mt",
+ "paramLongName": "master",
+ "paramDescription": "should be local or yarn",
+ "paramRequired": true
+ },
+ {
+ "paramName": "g",
+ "paramLongName": "graphRawPath",
+ "paramDescription": "the path of the graph Raw in hdfs",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json
new file mode 100644
index 0000000000..53ee010c45
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json
@@ -0,0 +1,39 @@
+[
+ {
+ "paramName": "s",
+ "paramLongName": "sourcePaths",
+ "paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mt",
+ "paramLongName": "master",
+ "paramDescription": "should be local or yarn",
+ "paramRequired": true
+ },
+ {
+ "paramName": "t",
+ "paramLongName": "targetPath",
+ "paramDescription": "the path of the target file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pgurl",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pguser",
+ "paramLongName": "postgresUser",
+ "paramDescription": "postgres user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pgpasswd",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "postgres password",
+ "paramRequired": false
+ }
+
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json
new file mode 100644
index 0000000000..c4910ec61b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json
@@ -0,0 +1,10 @@
+[
+ {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
+ {"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
+ {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
+ {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
+ {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
+ {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
+ {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
+ {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
+]
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json
new file mode 100644
index 0000000000..cb13ff0242
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json
@@ -0,0 +1,32 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pgurl",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
+ "paramRequired": true
+ },
+ {
+ "paramName": "pguser",
+ "paramLongName": "postgresUser",
+ "paramDescription": "postgres user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pgpasswd",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "postgres password",
+ "paramRequired": false
+ },
+ {
+ "paramName": "a",
+ "paramLongName": "action",
+ "paramDescription": "process claims",
+ "paramRequired": false
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json
new file mode 100644
index 0000000000..ee1a6ac4ee
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json
@@ -0,0 +1,38 @@
+[
+ {
+ "paramName": "p",
+ "paramLongName": "hdfsPath",
+ "paramDescription": "the path where storing the sequential file",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mongourl",
+ "paramLongName": "mongoBaseUrl",
+ "paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
+ "paramRequired": true
+ },
+ {
+ "paramName": "mongodb",
+ "paramLongName": "mongoDb",
+ "paramDescription": "mongo database",
+ "paramRequired": true
+ },
+ {
+ "paramName": "f",
+ "paramLongName": "mdFormat",
+ "paramDescription": "metadata format",
+ "paramRequired": true
+ },
+ {
+ "paramName": "l",
+ "paramLongName": "mdLayout",
+ "paramDescription": "metadata layout",
+ "paramRequired": true
+ },
+ {
+ "paramName": "i",
+ "paramLongName": "mdInterpretation",
+ "paramDescription": "metadata interpretation",
+ "paramRequired": true
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt
new file mode 100644
index 0000000000..dae37c9dc3
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt
@@ -0,0 +1,7 @@
+van
+der
+de
+dell
+sig
+mr
+mrs
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql
new file mode 100644
index 0000000000..0390c11aa3
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql
@@ -0,0 +1 @@
+SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql
new file mode 100644
index 0000000000..745f83971a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql
@@ -0,0 +1,17 @@
+SELECT
+ dor.datasource AS datasource,
+ dor.organization AS organization,
+ NULL AS startdate,
+ NULL AS enddate,
+ false AS inferred,
+ false AS deletedbyinference,
+ 0.9 AS trust,
+ NULL AS inferenceprovenance,
+ dc.id AS collectedfromid,
+ dc.officialname AS collectedfromname,
+ 'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
+ d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
+
+FROM dsm_datasource_organization dor
+ LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
+ LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql
new file mode 100644
index 0000000000..8c587f34ee
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql
@@ -0,0 +1,147 @@
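+-- NOTE: qualifier-like columns are encoded as classid@@@classname@@@schemeid@@@schemename and split downstream by the mapping application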
+SELECT
+ d.id AS datasourceid,
+ d.id || array_agg(distinct di.pid) AS identities,
+ d.officialname AS officialname,
+ d.englishname AS englishname,
+ d.contactemail AS contactemail,
+ CASE
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
+ THEN
+ 'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
+ THEN
+ 'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
+ THEN
+ 'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
+ THEN
+ 'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
+ THEN
+ 'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
+ THEN
+ 'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
+ THEN
+ 'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
+ THEN
+ 'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
+ THEN
+ 'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ ELSE
+ 'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
+ END AS openairecompatibility,
+ d.websiteurl AS websiteurl,
+ d.logourl AS logourl,
+ array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
+ d.latitude AS latitude,
+ d.longitude AS longitude,
+ d.namespaceprefix AS namespaceprefix,
+ NULL AS odnumberofitems,
+ NULL AS odnumberofitemsdate,
+
+ (SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
+ FROM UNNEST(
+ ARRAY(
+ SELECT trim(s)
+ FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
+
+ d.description AS description,
+ NULL AS odpolicies,
+ ARRAY(SELECT trim(s)
+ FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
+ ARRAY(SELECT trim(s)
+ FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
+ false AS inferred,
+ false AS deletedbyinference,
+ 0.9 AS trust,
+ NULL AS inferenceprovenance,
+ d.dateofcollection AS dateofcollection,
+ d.dateofvalidation AS dateofvalidation,
+ -- re3data fields
+ d.releasestartdate AS releasestartdate,
+ d.releaseenddate AS releaseenddate,
+ d.missionstatementurl AS missionstatementurl,
+ d.dataprovider AS dataprovider,
+ d.serviceprovider AS serviceprovider,
+ d.databaseaccesstype AS databaseaccesstype,
+ d.datauploadtype AS datauploadtype,
+ d.databaseaccessrestriction AS databaseaccessrestriction,
+ d.datauploadrestriction AS datauploadrestriction,
+ d.versioning AS versioning,
+ d.citationguidelineurl AS citationguidelineurl,
+ d.qualitymanagementkind AS qualitymanagementkind,
+ d.pidsystems AS pidsystems,
+ d.certificates AS certificates,
+ ARRAY[]::text[] AS policies,
+ dc.id AS collectedfromid,
+ dc.officialname AS collectedfromname,
+ d.typology || '@@@' || CASE
+ WHEN (d.typology = 'crissystem') THEN 'CRIS System'
+ WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
+ WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
+ WHEN (d.typology = 'infospace') THEN 'Information Space'
+ WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
+ WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
+ WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
+ WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
+ WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
+ WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
+ WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
+ WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
+ WHEN (d.typology = 'entityregistry') THEN 'Registry'
+ WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
+ WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
+ WHEN (d.typology = 'websource') THEN 'Web Source'
+ WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
+ WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
+ WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
+ WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
+ WHEN (d.typology = 'orprepository') THEN 'Repository'
+ ELSE 'Other'
+ END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
+ 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
+ CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
+
+FROM dsm_datasources d
+
+LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
+LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
+LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
+
+GROUP BY
+ d.id,
+ d.officialname,
+ d.englishname,
+ d.websiteurl,
+ d.logourl,
+ d.contactemail,
+ d.namespaceprefix,
+ d.description,
+ d.latitude,
+ d.longitude,
+ d.dateofcollection,
+ d.dateofvalidation,
+ d.releasestartdate,
+ d.releaseenddate,
+ d.missionstatementurl,
+ d.dataprovider,
+ d.serviceprovider,
+ d.databaseaccesstype,
+ d.datauploadtype,
+ d.databaseaccessrestriction,
+ d.datauploadrestriction,
+ d.versioning,
+ d.citationguidelineurl,
+ d.qualitymanagementkind,
+ d.pidsystems,
+ d.certificates,
+ dc.id,
+ dc.officialname,
+ d.issn,
+ d.eissn,
+ d.lissn
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql
new file mode 100644
index 0000000000..aeb04aff91
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql
@@ -0,0 +1,35 @@
+SELECT
+ o.id AS organizationid,
+ o.legalshortname AS legalshortname,
+ o.legalname AS legalname,
+ o.websiteurl AS websiteurl,
+ o.logourl AS logourl,
+ o.ec_legalbody AS eclegalbody,
+ o.ec_legalperson AS eclegalperson,
+ o.ec_nonprofit AS ecnonprofit,
+ o.ec_researchorganization AS ecresearchorganization,
+ o.ec_highereducation AS echighereducation,
+ o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
+ o.ec_internationalorganization AS ecinternationalorganization,
+ o.ec_enterprise AS ecenterprise,
+ o.ec_smevalidated AS ecsmevalidated,
+ o.ec_nutscode AS ecnutscode,
+ o.dateofcollection AS dateofcollection,
+ o.lastupdate AS dateoftransformation,
+ false AS inferred,
+ false AS deletedbyinference,
+ o.trust AS trust,
+ '' AS inferenceprovenance,
+ d.id AS collectedfromid,
+ d.officialname AS collectedfromname,
+ o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
+ 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
+
+ ARRAY[]::text[] AS pid
+FROM dsm_organizations o
+ LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
+
+
+
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql
new file mode 100644
index 0000000000..99c8e04b4b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql
@@ -0,0 +1,53 @@
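+-- The first block selects the canonical OpenOrgs organizations; the second block generates one
+-- additional 'openorgsmesh' record per alternative name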
+SELECT
+ o.id AS organizationid,
+ coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
+ o.name AS legalname,
+ array_agg(DISTINCT n.name) AS "alternativeNames",
+ (array_agg(u.url))[1] AS websiteurl,
+ o.modification_date AS dateoftransformation,
+ false AS inferred,
+ false AS deletedbyinference,
+ 0.95 AS trust,
+ '' AS inferenceprovenance,
+ 'openaire____::openorgs' AS collectedfromid,
+ 'OpenOrgs Database' AS collectedfromname,
+ o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
+ 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
+ array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
+FROM organizations o
+ LEFT OUTER JOIN acronyms a ON (a.id = o.id)
+ LEFT OUTER JOIN urls u ON (u.id = o.id)
+ LEFT OUTER JOIN other_ids i ON (i.id = o.id)
+ LEFT OUTER JOIN other_names n ON (n.id = o.id)
+GROUP BY
+ o.id,
+ o.name,
+ o.modification_date,
+ o.country
+
+UNION ALL
+
+SELECT
+ 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
+ n.name AS legalshortname,
+ n.name AS legalname,
+ ARRAY[]::text[] AS "alternativeNames",
+ (array_agg(u.url))[1] AS websiteurl,
+ o.modification_date AS dateoftransformation,
+ false AS inferred,
+ false AS deletedbyinference,
+ 0.88 AS trust,
+ '' AS inferenceprovenance,
+ 'openaire____::openorgs' AS collectedfromid,
+ 'OpenOrgs Database' AS collectedfromname,
+ o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
+ 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
+ array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
+FROM other_names n
+ LEFT OUTER JOIN organizations o ON (n.id = o.id)
+ LEFT OUTER JOIN urls u ON (u.id = o.id)
+ LEFT OUTER JOIN other_ids i ON (i.id = o.id)
+GROUP BY
+ o.id, o.modification_date, o.country, n.name
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql
new file mode 100644
index 0000000000..4c06ca5b9a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql
@@ -0,0 +1,19 @@
+SELECT
+ po.project AS project,
+ po.resporganization AS resporganization,
+ po.participantnumber AS participantnumber,
+ po.contribution AS contribution,
+ NULL AS startdate,
+ NULL AS enddate,
+ false AS inferred,
+ false AS deletedbyinference,
+ po.trust AS trust,
+ NULL AS inferenceprovenance,
+ dc.id AS collectedfromid,
+ dc.officialname AS collectedfromname,
+ po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
+ 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
+
+FROM project_organization po
+ LEFT OUTER JOIN projects p ON (p.id = po.project)
+ LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql
new file mode 100644
index 0000000000..685b57ab65
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql
@@ -0,0 +1,89 @@
+SELECT
+ p.id AS projectid,
+ p.code AS code,
+ p.websiteurl AS websiteurl,
+ p.acronym AS acronym,
+ p.title AS title,
+ p.startdate AS startdate,
+ p.enddate AS enddate,
+ p.call_identifier AS callidentifier,
+ p.keywords AS keywords,
+ p.duration AS duration,
+ p.ec_sc39 AS ecsc39,
+ p.oa_mandate_for_publications AS oamandatepublications,
+ p.ec_article29_3 AS ecarticle29_3,
+ p.dateofcollection AS dateofcollection,
+ p.lastupdate AS dateoftransformation,
+ p.inferred AS inferred,
+ p.deletedbyinference AS deletedbyinference,
+ p.trust AS trust,
+ p.inferenceprovenance AS inferenceprovenance,
+ p.optional1 AS optional1,
+ p.optional2 AS optional2,
+ p.jsonextrainfo AS jsonextrainfo,
+ p.contactfullname AS contactfullname,
+ p.contactfax AS contactfax,
+ p.contactphone AS contactphone,
+ p.contactemail AS contactemail,
+ p.summary AS summary,
+ p.currency AS currency,
+ p.totalcost AS totalcost,
+ p.fundedamount AS fundedamount,
+ dc.id AS collectedfromid,
+ dc.officialname AS collectedfromname,
+ p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype,
+ pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
+ array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
+ array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
+ array_agg(DISTINCT fp.path) AS fundingtree
+
+ FROM projects p
+
+ LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
+ LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
+
+ LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
+ LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
+
+ LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
+
+ LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
+ LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
+
+ LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
+ LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
+
+ LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
+ LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
+
+ GROUP BY
+ p.id,
+ p.code,
+ p.websiteurl,
+ p.acronym,
+ p.title,
+ p.startdate,
+ p.enddate,
+ p.call_identifier,
+ p.keywords,
+ p.duration,
+ p.ec_sc39,
+ p.oa_mandate_for_publications,
+ p.ec_article29_3,
+ p.dateofcollection,
+ p.inferred,
+ p.deletedbyinference,
+ p.trust,
+ p.inferenceprovenance,
+ p.contactfullname,
+ p.contactfax,
+ p.contactphone,
+ p.contactemail,
+ p.summary,
+ p.currency,
+ p.totalcost,
+ p.fundedamount,
+ dc.id,
+ dc.officialname,
+ pac.code, pac.name, pas.code, pas.name,
+ p.contracttype , p.contracttypename, p.contracttypescheme;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql
new file mode 100644
index 0000000000..6cff188756
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql
@@ -0,0 +1,90 @@
+SELECT
+ p.id AS projectid,
+ p.code AS code,
+ p.websiteurl AS websiteurl,
+ p.acronym AS acronym,
+ p.title AS title,
+ p.startdate AS startdate,
+ p.enddate AS enddate,
+ p.call_identifier AS callidentifier,
+ p.keywords AS keywords,
+ p.duration AS duration,
+ p.ec_sc39 AS ecsc39,
+ p.oa_mandate_for_publications AS oamandatepublications,
+ p.ec_article29_3 AS ecarticle29_3,
+ p.dateofcollection AS dateofcollection,
+ p.lastupdate AS dateoftransformation,
+ p.inferred AS inferred,
+ p.deletedbyinference AS deletedbyinference,
+ p.trust AS trust,
+ p.inferenceprovenance AS inferenceprovenance,
+ p.optional1 AS optional1,
+ p.optional2 AS optional2,
+ p.jsonextrainfo AS jsonextrainfo,
+ p.contactfullname AS contactfullname,
+ p.contactfax AS contactfax,
+ p.contactphone AS contactphone,
+ p.contactemail AS contactemail,
+ p.summary AS summary,
+ p.currency AS currency,
+ p.totalcost AS totalcost,
+ p.fundedamount AS fundedamount,
+ dc.id AS collectedfromid,
+ dc.officialname AS collectedfromname,
+ ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
+ pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
+ array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
+ array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
+ array_agg(DISTINCT fp.path) AS fundingtree
+ FROM projects p
+ LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
+ LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
+
+ LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
+ LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
+
+ LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
+
+ LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
+ LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
+
+ LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
+ LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
+
+ LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
+ LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
+
+ LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
+ LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
+
+ GROUP BY
+ p.id,
+ p.code,
+ p.websiteurl,
+ p.acronym,
+ p.title,
+ p.startdate,
+ p.enddate,
+ p.call_identifier,
+ p.keywords,
+ p.duration,
+ p.ec_sc39,
+ p.oa_mandate_for_publications,
+ p.ec_article29_3,
+ p.dateofcollection,
+ p.inferred,
+ p.deletedbyinference,
+ p.trust,
+ p.inferenceprovenance,
+ p.contactfullname,
+ p.contactfax,
+ p.contactphone,
+ p.contactemail,
+ p.summary,
+ p.currency,
+ p.totalcost,
+ p.fundedamount,
+ dc.id,
+ dc.officialname,
+ pac.code, pac.name, pas.code, pas.name,
+ ctc.code, ctc.name, cts.code, cts.name;
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql
new file mode 100644
index 0000000000..4407559c61
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql
@@ -0,0 +1,17 @@
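+-- Similarity relations: explicit openaire_simrels entries plus links between each organization and its
+-- acronym- and name-derived 'openorgsmesh' identifiers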
+SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
+
+UNION ALL
+
+SELECT
+ o.id AS id1,
+ 'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
+FROM acronyms a
+ LEFT OUTER JOIN organizations o ON (a.id = o.id)
+
+UNION ALL
+
+SELECT
+ o.id AS id1,
+ 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
+FROM other_names n
+ LEFT OUTER JOIN organizations o ON (n.id = o.id)
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json
new file mode 100644
index 0000000000..ce72f53ca6
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json
@@ -0,0 +1,5 @@
+[
+ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+ {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
+ {"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}
+]
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml
new file mode 100644
index 0000000000..9637ebdc62
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml
@@ -0,0 +1,30 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>sourceNN</name>
+        <value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
+    </property>
+    <property>
+        <name>spark2EventLogDir</name>
+        <value>/user/spark/applicationHistory</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml
new file mode 100644
index 0000000000..ed01c8de4c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml
@@ -0,0 +1,122 @@
+
+
+
+ sourceNN
+ the source name node
+
+
+ isLookupUrl
+ the isLookup service endpoint
+
+
+ workingDirectory
+ /tmp/actionsets
+ working directory
+
+
+ distcp_memory_mb
+ 6144
+ memory for distcp copying actionsets from remote cluster
+
+
+ distcp_task_timeout
+ 60000000
+ timeout for distcp copying actions from remote cluster
+
+
+ distcp_num_maps
+ 1
+ maximum number of map tasks used in the distcp process
+
+
+ transform_only
+ activate transform-only mode. Only apply the transformation step
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+ spark2YarnHistoryServerAddress
+ spark 2.* yarn history server address
+
+
+ spark2EventLogDir
+ spark 2.* event log dir location
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.migration.actions.MigrateActionSet
+ -Dmapred.task.timeout=${distcp_task_timeout}
+ -is${isLookupUrl}
+ -sn${sourceNN}
+ -tn${nameNode}
+ -w${workingDirectory}
+ -nm${distcp_num_maps}
+ -mm${distcp_memory_mb}
+ -tt${distcp_task_timeout}
+ -tr${transform_only}
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ transform_actions
+ eu.dnetlib.dhp.migration.actions.TransformActions
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-cores ${sparkExecutorCores}
+ --executor-memory ${sparkExecutorMemory}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+
+ -mtyarn
+ -is${isLookupUrl}
+ --inputPaths${wf:actionData('migrate_actionsets')['target_paths']}
+
+
+
+
+
+
+ migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml
new file mode 100644
index 0000000000..1ac456976d
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml
@@ -0,0 +1,169 @@
+
+
+
+ migrationClaimsPathStep1
+ the base path to store the hdfs files
+
+
+ migrationClaimsPathStep2
+ the temporary path to store entities before dispatching
+
+
+ migrationClaimsPathStep3
+ the graph Raw base path
+
+
+ postgresURL
+ the postgres URL to access the database
+
+
+ postgresUser
+ the postgres user
+
+
+ postgresPassword
+ the postgres password
+
+
+ mongoURL
+ mongoDB url, example: mongodb://[username:password@]host[:port]
+
+
+ mongoDb
+ mongo database
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication
+ -p${migrationClaimsPathStep1}/db_claims
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+ -aclaims
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${migrationClaimsPathStep1}/odf_claims
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fODF
+ -lstore
+ -iclaim
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${migrationClaimsPathStep1}/oaf_claims
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fOAF
+ -lstore
+ -iclaim
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ GenerateClaimEntities
+ eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication
+ dhp-aggregation-${projectVersion}.jar
+ --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+ -mt yarn-cluster
+ -s${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims
+ -t${migrationClaimsPathStep2}/claim_entities
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ GenerateClaimGraph
+ eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication
+ dhp-aggregation-${projectVersion}.jar
+ --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+ -mt yarn-cluster
+ -s${migrationClaimsPathStep2}/claim_entities
+ -g${migrationClaimsPathStep3}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml
new file mode 100644
index 0000000000..42ab59822d
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml
@@ -0,0 +1,197 @@
+
+
+
+
+ workingPath
+ /tmp/dhp_migration
+ the base path to store temporary intermediate data
+
+
+ graphBasePath
+ the target path to store raw graph
+
+
+ reuseContent
+ false
+ should import content from the aggregator or reuse a previous version
+
+
+ postgresURL
+ the postgres URL to access the database
+
+
+ postgresUser
+ the postgres user
+
+
+ postgresPassword
+ the postgres password
+
+
+ mongoURL
+ mongoDB url, example: mongodb://[username:password@]host[:port]
+
+
+ mongoDb
+ mongo database
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+
+
+ mapreduce.job.queuename
+ ${queueName}
+
+
+ oozie.launcher.mapred.job.queue.name
+ ${oozieLauncherQueueName}
+
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+ ${wf:conf('reuseContent') eq false}
+ ${wf:conf('reuseContent') eq true}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication
+ -p${workingPath}/db_records
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${workingPath}/odf_records
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fODF
+ -lstore
+ -icleaned
+
+
+
+
+
+
+
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${workingPath}/oaf_records
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fOAF
+ -lstore
+ -icleaned
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ GenerateEntities
+ eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory ${sparkExecutorMemory}
+ --executor-cores ${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+ --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+
+ -mt yarn-cluster
+ -s${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records
+ -t${workingPath}/all_entities
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ yarn
+ cluster
+ GenerateGraph
+ eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication
+ dhp-aggregation-${projectVersion}.jar
+
+ --executor-memory ${sparkExecutorMemory}
+ --executor-cores ${sparkExecutorCores}
+ --driver-memory=${sparkDriverMemory}
+ --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
+ --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
+ --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+
+ -mt yarn-cluster
+ -s${workingPath}/all_entities
+ -g${graphBasePath}/graph_raw
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml
new file mode 100644
index 0000000000..f16e22f957
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml
@@ -0,0 +1,103 @@
+
+
+
+ migrationPathStep1
+ the base path to store the hdfs files
+
+
+ postgresURL
+ the postgres URL to access the database
+
+
+ postgresUser
+ the postgres user
+
+
+ postgresPassword
+ the postgres password
+
+
+ mongoURL
+ mongoDB url, example: mongodb://[username:password@]host[:port]
+
+
+ mongoDb
+ mongo database
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication
+ -p${migrationPathStep1}/db_records
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${migrationPathStep1}/odf_records
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fODF
+ -lstore
+ -icleaned
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication
+ -p${migrationPathStep1}/oaf_records
+ -mongourl${mongoURL}
+ -mongodb${mongoDb}
+ -fOAF
+ -lstore
+ -icleaned
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml
new file mode 100644
index 0000000000..0730f3a1f1
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml
@@ -0,0 +1,62 @@
+
+
+
+ migrationPathStep1
+ the base path to store the hdfs files
+
+
+ postgresURL
+ the postgres URL to access the database
+
+
+ postgresUser
+ the postgres user
+
+
+ postgresPassword
+ the postgres password
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication
+ -p${migrationPathStep1}/db_records
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml
new file mode 100644
index 0000000000..cd0a4025e8
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml
@@ -0,0 +1,74 @@
+
+
+
+ migrationPathStep1
+ the base path to store the hdfs files
+
+
+ migrationPathStep2
+ the temporary path to store entities before dispatching
+
+
+ postgresURL
+ the postgres URL to access the database
+
+
+ postgresUser
+ the postgres user
+
+
+ postgresPassword
+ the postgres password
+
+
+ sparkDriverMemory
+ memory for driver process
+
+
+ sparkExecutorMemory
+ memory for individual executor
+
+
+ sparkExecutorCores
+ number of cores used by single executor
+
+
+
+
+
+
+ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ${jobTracker}
+ ${nameNode}
+ yarn-cluster
+ cluster
+ GenerateEntities
+ eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication
+ dhp-aggregation-${projectVersion}.jar
+ --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"
+ -mt yarn-cluster
+ -s${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records
+ -t${migrationPathStep2}/all_entities
+ -pgurl${postgresURL}
+ -pguser${postgresUser}
+ -pgpasswd${postgresPassword}
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml
new file mode 100644
index 0000000000..2e0ed9aeea
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration>
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name>
+        <value>spark2</value>
+    </property>
+</configuration>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml
new file mode 100644
index 0000000000..8688f09d18
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml
@@ -0,0 +1,60 @@
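+<!-- Step 3: DispatchEntitiesApplication dispatches the all_entities store produced by step 2 into the raw graph base path ${migrationPathStep3} -->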
+<workflow-app name="migration_regular_step3" xmlns="uri:oozie:workflow:0.5">
+
+    <parameters>
+        <property>
+            <name>migrationPathStep2</name>
+            <description>the temporary path to store entities before dispatching</description>
+        </property>
+        <property>
+            <name>migrationPathStep3</name>
+            <description>the graph Raw base path</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+    </parameters>
+
+    <start to="ResetGraph"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <action name="ResetGraph">
+        <fs>
+            <delete path='${migrationPathStep3}'/>
+        </fs>
+        <ok to="GenerateGraph"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="GenerateGraph">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn-cluster</master>
+            <mode>cluster</mode>
+            <name>GenerateGraph</name>
+            <class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
+            <jar>dhp-aggregation-${projectVersion}.jar</jar>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
+            <arg>-mt</arg><arg>yarn-cluster</arg>
+            <arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
+            <arg>-g</arg><arg>${migrationPathStep3}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties
new file mode 100644
index 0000000000..63cba917ee
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties
@@ -0,0 +1,9 @@
+# Set root logger level to DEBUG and its only appender to A1.
+log4j.rootLogger=INFO, A1
+
+# A1 is set to be a ConsoleAppender.
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
index 848fbe17da..fde928a8b6 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java
@@ -1,79 +1,89 @@
package eu.dnetlib.dhp.collection;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.model.mdstore.Provenance;
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
-import org.junit.*;
-
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.AfterEach;
+
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.model.mdstore.Provenance;
+
+import static org.junit.jupiter.api.Assertions.*;
+
public class CollectionJobTest {
- private Path testDir;
- @Before
- public void setup() throws IOException {
- testDir = Files.createTempDirectory("dhp-collection");
- }
+ private Path testDir;
- @After
- public void teadDown() throws IOException {
- FileUtils.deleteDirectory(testDir.toFile());
- }
+ @BeforeEach
+ public void setup() throws IOException {
+ testDir = Files.createTempDirectory("dhp-collection");
+ }
- @Test
- public void tesCollection() throws Exception {
- Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
- GenerateNativeStoreSparkJob.main(new String[] {
- "-mt", "local",
- "-w", "wid",
- "-e", "XML",
- "-d", ""+System.currentTimeMillis(),
- "-p", new ObjectMapper().writeValueAsString(provenance),
- "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
- "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
- "-o", testDir.toString()+"/store",
- "-t", "true",
- "-ru", "",
- "-rp", "",
- "-rh", "",
- "-ro", "",
- "-rr", ""});
- System.out.println(new ObjectMapper().writeValueAsString(provenance));
- }
+ @AfterEach
+ public void teadDown() throws IOException {
+ FileUtils.deleteDirectory(testDir.toFile());
+ }
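+	// Runs GenerateNativeStoreSparkJob in local mode on the bundled native.seq sample, using a synthetic Provenance and an OAI identifier XPath.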
+ @Test
+ public void tesCollection() throws Exception {
+ final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
+ GenerateNativeStoreSparkJob.main(new String[] {
+ "-mt", "local",
+ "-w", "wid",
+ "-e", "XML",
+ "-d", "" + System.currentTimeMillis(),
+ "-p", new ObjectMapper().writeValueAsString(provenance),
+ "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
+ "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
+ "-o", testDir.toString() + "/store",
+ "-t", "true",
+ "-ru", "",
+ "-rp", "",
+ "-rh", "",
+ "-ro", "",
+ "-rr", "" });
+ System.out.println(new ObjectMapper().writeValueAsString(provenance));
+ }
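+	// Parses the sample record.xml into a MetadataRecord and prints the generated and original identifiers.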
+ @Test
+ public void testGenerationMetadataRecord() throws Exception {
- @Test
- public void testGenerationMetadataRecord() throws Exception {
+ final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
- final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
+ final MetadataRecord record = GenerateNativeStoreSparkJob
+ .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
+ "ns_prefix"), System.currentTimeMillis(), null, null);
- MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
+ assert record != null;
+ System.out.println(record.getId());
+ System.out.println(record.getOriginalId());
- assert record != null;
- System.out.println(record.getId());
- System.out.println(record.getOriginalId());
+ }
+ @Test
+ public void TestEquals() throws IOException {
- }
+ final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
+ final MetadataRecord record = GenerateNativeStoreSparkJob
+ .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
+ "ns_prefix"), System.currentTimeMillis(), null, null);
+ final MetadataRecord record1 = GenerateNativeStoreSparkJob
+ .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
+ "ns_prefix"), System.currentTimeMillis(), null, null);
+ assert record != null;
+ record.setBody("ciao");
+ assert record1 != null;
+ record1.setBody("mondo");
+ assertEquals(record, record1);
-
- @Test
- public void TestEquals () throws IOException {
-
- final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
- MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
- MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
- assert record != null;
- record.setBody("ciao");
- assert record1 != null;
- record1.setBody("mondo");
- Assert.assertEquals(record, record1);
-
- }
+ }
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java
index 6a9417097f..665e989d83 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java
@@ -7,13 +7,13 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
import java.io.File;
-import static org.junit.Assert.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.*;
@@ -24,7 +24,7 @@ public class DnetCollectorWorkerApplicationTests {
private MessageManager messageManager = mock(MessageManager.class);
private DnetCollectorWorker worker;
- @Before
+ @BeforeEach
public void setup() throws Exception {
ObjectMapper mapper = new ObjectMapper();
final String apiJson = mapper.writeValueAsString(getApi());
@@ -47,7 +47,7 @@ public class DnetCollectorWorkerApplicationTests {
}
- @After
+ @AfterEach
public void dropDown(){
File f = new File("/tmp/file.seq");
f.delete();
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java
new file mode 100644
index 0000000000..d63bb3ee32
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java
@@ -0,0 +1,293 @@
+package eu.dnetlib.dhp.migration.step1;
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.io.IOUtils;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.junit.jupiter.MockitoExtension;
+
+import java.io.IOException;
+import java.sql.Array;
+import java.sql.Date;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.List;
+import java.util.Objects;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+@ExtendWith(MockitoExtension.class)
+public class MigrateDbEntitiesApplicationTest {
+
+ private MigrateDbEntitiesApplication app;
+
+ @Mock
+ private ResultSet rs;
+
+ @BeforeEach
+ public void setUp() {
+ this.app = new MigrateDbEntitiesApplication();
+ }
+
+ @Test
+ public void testProcessDatasource() throws Exception {
+		final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
+
+		final List<Oaf> list = app.processDatasource(rs);
+ assertEquals(1, list.size());
+ verifyMocks(fields);
+
+ final Datasource ds = (Datasource) list.get(0);
+ assertValidId(ds.getId());
+ assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields));
+ assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields));
+ assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
+ assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
+ assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
+ assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
+ assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
+ }
+
+ @Test
+ public void testProcessProject() throws Exception {
+		final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
+
+		final List<Oaf> list = app.processProject(rs);
+ assertEquals(1, list.size());
+ verifyMocks(fields);
+
+ final Project p = (Project) list.get(0);
+ assertValidId(p.getId());
+ assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
+ assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
+ assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
+ assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
+ }
+
+ @Test
+ public void testProcessOrganization() throws Exception {
+		final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
+
+		final List<Oaf> list = app.processOrganization(rs);
+
+ assertEquals(1, list.size());
+
+ verifyMocks(fields);
+
+ final Organization o = (Organization) list.get(0);
+ assertValidId(o.getId());
+ assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields));
+ assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
+ assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
+ assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]);
+ assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]);
+ assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]);
+ assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]);
+ assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
+ assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
+ }
+
+ @Test
+ public void testProcessDatasourceOrganization() throws Exception {
+		final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
+
+		final List<Oaf> list = app.processDatasourceOrganization(rs);
+
+ assertEquals(2, list.size());
+ verifyMocks(fields);
+
+ final Relation r1 = (Relation) list.get(0);
+ final Relation r2 = (Relation) list.get(1);
+ assertValidId(r1.getSource());
+ assertValidId(r2.getSource());
+ assertEquals(r1.getSource(), r2.getTarget());
+ assertEquals(r2.getSource(), r1.getTarget());
+ }
+
+ @Test
+ public void testProcessProjectOrganization() throws Exception {
+		final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
+
+		final List<Oaf> list = app.processProjectOrganization(rs);
+
+ assertEquals(2, list.size());
+ verifyMocks(fields);
+
+ final Relation r1 = (Relation) list.get(0);
+ final Relation r2 = (Relation) list.get(1);
+ assertValidId(r1.getSource());
+ assertValidId(r2.getSource());
+ assertEquals(r1.getSource(), r2.getTarget());
+ assertEquals(r2.getSource(), r1.getTarget());
+ }
+
+ @Test
+ public void testProcessClaims_context() throws Exception {
+		final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
+
+		final List<Oaf> list = app.processClaims(rs);
+
+ assertEquals(1, list.size());
+ verifyMocks(fields);
+ }
+
+ @Test
+ public void testProcessClaims_rels() throws Exception {
+		final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
+
+		final List<Oaf> list = app.processClaims(rs);
+
+ assertEquals(2, list.size());
+ verifyMocks(fields);
+ }
+
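+	// Stubs the mocked ResultSet so that every field listed in the JSON fixture is returned by the getter matching its declared type.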
+	private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
+ final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
+ final ObjectMapper mapper = new ObjectMapper();
+		final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
+
+ for (final TypedField tf : list) {
+ if (tf.getValue() == null) {
+ switch (tf.getType()) {
+ case "not_used":
+ break;
+ case "boolean":
+ Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
+ break;
+ case "date":
+ Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
+ break;
+ case "int":
+ Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
+ break;
+ case "double":
+ Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
+ break;
+ case "array":
+ Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
+ break;
+ case "string":
+ default:
+ Mockito.when(rs.getString(tf.getField())).thenReturn(null);
+ break;
+ }
+ } else {
+ switch (tf.getType()) {
+ case "not_used":
+ break;
+ case "boolean":
+ Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
+ break;
+ case "date":
+ Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString()));
+ break;
+ case "int":
+ Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString()));
+ break;
+ case "double":
+ Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString()));
+ break;
+ case "array":
+ final Array arr = Mockito.mock(Array.class);
+					final String[] values = ((List<?>) tf.getValue()).stream()
+ .filter(Objects::nonNull)
+ .map(o -> o.toString())
+ .toArray(String[]::new);
+
+ Mockito.when(arr.getArray()).thenReturn(values);
+ Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
+ break;
+ case "string":
+ default:
+ Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
+ break;
+ }
+ }
+ }
+
+ return list;
+ }
+
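+	// Verifies that each fixture field was actually read from the ResultSet through the getter matching its declared type.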
+	private void verifyMocks(final List<TypedField> list) throws SQLException {
+ for (final TypedField tf : list) {
+
+ switch (tf.getType()) {
+ case "not_used":
+ break;
+ case "boolean":
+ Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
+ break;
+ case "date":
+ Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
+ break;
+ case "int":
+ Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
+ break;
+ case "double":
+ Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
+ break;
+ case "array":
+ Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
+ break;
+ case "string":
+ default:
+ Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
+ break;
+ }
+ }
+ }
+
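+	// OpenAIRE identifiers are 49 characters: a 2-character entity type code, '|', a 12-character namespace prefix, then "::" and the 32-character hash.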
+ private void assertValidId(final String id) {
+ assertEquals(49, id.length());
+ assertEquals('|', id.charAt(2));
+ assertEquals(':', id.charAt(15));
+ assertEquals(':', id.charAt(16));
+ }
+
+	private String getValueAsString(final String name, final List<TypedField> fields) {
+ return fields.stream()
+ .filter(f -> f.getField().equals(name))
+ .map(TypedField::getValue)
+ .filter(Objects::nonNull)
+ .map(o -> o.toString())
+ .findFirst()
+ .get();
+ }
+}
+
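+// Simple bean describing one column of the JSON fixtures: field name, declared SQL type and expected value.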
+class TypedField {
+
+ private String field;
+ private String type;
+ private Object value;
+
+ public String getField() {
+ return field;
+ }
+
+ public void setField(final String field) {
+ this.field = field;
+ }
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(final String type) {
+ this.type = type;
+ }
+
+ public Object getValue() {
+ return value;
+ }
+
+ public void setValue(final Object value) {
+ this.value = value;
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
index 5e5e42f1e2..dfa0c37203 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java
@@ -6,47 +6,32 @@ import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
import eu.dnetlib.dhp.utils.DHPUtils;
import net.sf.saxon.s9api.*;
-import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
-import org.junit.*;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
-import org.mockito.junit.MockitoJUnit;
-import org.mockito.junit.MockitoRule;
+import org.mockito.junit.jupiter.MockitoExtension;
import javax.xml.transform.stream.StreamSource;
-import java.io.File;
-import java.io.IOException;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+@ExtendWith(MockitoExtension.class)
public class TransformationJobTest {
@Mock
- LongAccumulator accumulator;
-
- @Rule
- public MockitoRule mockitoRule = MockitoJUnit.rule();
-
- private Path testDir;
-
- @Before
- public void setup() throws IOException {
- testDir = Files.createTempDirectory("dhp-collection");
- }
-
- @After
- public void tearDown() throws IOException {
- FileUtils.deleteDirectory(testDir.toFile());
- }
-
+ private LongAccumulator accumulator;
@Test
public void testTransformSaxonHE() throws Exception {
@@ -70,9 +55,9 @@ public class TransformationJobTest {
System.out.println(output.toString());
}
-
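+	// JUnit 5 injects a fresh temporary directory via @TempDir, replacing the manual @Before/@After setup removed above.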
+ @DisplayName("Test TransformSparkJobNode.main")
@Test
- public void transformTest() throws Exception {
+ public void transformTest(@TempDir Path testDir) throws Exception {
final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
final String mdstore_output = testDir.toString()+"/version";
final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")));
@@ -89,8 +74,6 @@ public class TransformationJobTest {
"-rh", "",
"-ro", "",
"-rr", ""});
-
-
}
@Test
@@ -121,7 +104,7 @@ public class TransformationJobTest {
record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
final MetadataRecord result = tf.call(record);
- Assert.assertNotNull(result.getBody());
+ assertNotNull(result.getBody());
System.out.println(result.getBody());
}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java
index d96a7ac4c8..c2db17a9d7 100644
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java
@@ -1,7 +1,7 @@
package eu.dnetlib.dhp.transformation.vocabulary;
-import org.junit.Test;
-import static org.junit.Assert.*;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
public class VocabularyTest {
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json
new file mode 100644
index 0000000000..72bd01a966
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json
@@ -0,0 +1,27 @@
+[
+ {
+ "field": "source_type",
+ "type": "string",
+ "value": "context"
+ },
+ {
+ "field": "source_id",
+ "type": "string",
+ "value": "oa-pg"
+ },
+ {
+ "field": "target_type",
+ "type": "string",
+ "value": "publication"
+ },
+ {
+ "field": "target_id",
+ "type": "string",
+ "value": "userclaim___::d99de49026e79d271f3e7451d8de18b6"
+ },
+ {
+ "field": "semantics",
+ "type": "not_used",
+ "value": "isRelevantTo"
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json
new file mode 100644
index 0000000000..28fa700356
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json
@@ -0,0 +1,27 @@
+[
+ {
+ "field": "source_type",
+ "type": "string",
+ "value": "project"
+ },
+ {
+ "field": "source_id",
+ "type": "string",
+ "value": "corda__h2020::b38a638a93b505d670fcacc47a0283d6"
+ },
+ {
+ "field": "target_type",
+ "type": "string",
+ "value": "publication"
+ },
+ {
+ "field": "target_id",
+ "type": "string",
+ "value": "userclaim___::5b5117253d3c64c79809d0b92fa287b4"
+ },
+ {
+ "field": "semantics",
+ "type": "not_used",
+ "value": "resultProject_outcome_produces"
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json
new file mode 100644
index 0000000000..3a0318ed7b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json
@@ -0,0 +1,62 @@
+[
+ {
+ "field": "datasource",
+ "type": "string",
+ "value": "openaire____::revistasunicauca"
+ },
+ {
+ "field": "organization",
+ "type": "string",
+ "value": "openaire____::openaire____::revistasunicauca"
+ },
+ {
+ "field": "startdate",
+ "type": "not_used",
+ "value": null
+ },
+ {
+ "field": "enddate",
+ "type": "not_used",
+ "value": null
+ },
+ {
+ "field": "inferred",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "deletedbyinference",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "trust",
+ "type": "string",
+ "value": "0.9"
+ },
+ {
+ "field": "inferenceprovenance",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "collectedfromid",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "collectedfromname",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "semantics",
+ "type": "not_used",
+ "value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies"
+ },
+ {
+ "field": "provenanceaction",
+ "type": "not_used",
+ "value": null
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json
new file mode 100644
index 0000000000..71e84954f6
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json
@@ -0,0 +1,234 @@
+[
+ {
+ "field": "datasourceid",
+ "type": "string",
+ "value": "274269ac6f3b::2579-5449"
+ },
+ {
+ "field": "identities",
+ "type": "not_used",
+ "value": [
+ "274269ac6f3b::2579-5449",
+ null
+ ]
+ },
+ {
+ "field": "officialname",
+ "type": "string",
+ "value": "Jurnal Ilmiah Pendidikan Scholastic"
+ },
+ {
+ "field": "englishname",
+ "type": "string",
+ "value": "Jurnal Ilmiah Pendidikan Scholastic"
+ },
+ {
+ "field": "contactemail",
+ "type": "string",
+ "value": "test@test.it"
+ },
+ {
+ "field": "openairecompatibility",
+ "type": "string",
+ "value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel"
+ },
+ {
+ "field": "websiteurl",
+ "type": "string",
+ "value": "http://e-journal.sastra-unes.com/index.php/JIPS/index"
+ },
+ {
+ "field": "logourl",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "accessinfopackage",
+ "type": "array",
+ "value": [
+ null
+ ]
+ },
+ {
+ "field": "latitude",
+ "type": "double",
+ "value": 0
+ },
+ {
+ "field": "longitude",
+ "type": "double",
+ "value": 0
+ },
+ {
+ "field": "namespaceprefix",
+ "type": "string",
+ "value": "ojs_25795449"
+ },
+ {
+ "field": "odnumberofitems",
+ "type": "int",
+ "value": null
+ },
+ {
+ "field": "odnumberofitemsdate",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "subjects",
+ "type": "array",
+ "value": null
+ },
+ {
+ "field": "description",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "odpolicies",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "odlanguages",
+ "type": "array",
+ "value": []
+ },
+ {
+ "field": "odcontenttypes",
+ "type": "array",
+ "value": [
+ "Journal articles"
+ ]
+ },
+ {
+ "field": "inferred",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "deletedbyinference",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "trust",
+ "type": "string",
+ "value": "0.9"
+ },
+ {
+ "field": "inferenceprovenance",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "dateofcollection",
+ "type": "date",
+ "value": "2020-01-21"
+ },
+ {
+ "field": "dateofvalidation",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "releasestartdate",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "releaseenddate",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "missionstatementurl",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "dataprovider",
+ "type": "boolean",
+ "value": null
+ },
+ {
+ "field": "serviceprovider",
+ "type": "boolean",
+ "value": null
+ },
+ {
+ "field": "databaseaccesstype",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "datauploadtype",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "databaseaccessrestriction",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "datauploadrestriction",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "versioning",
+ "type": "boolean",
+ "value": null
+ },
+ {
+ "field": "citationguidelineurl",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "qualitymanagementkind",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "pidsystems",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "certificates",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "policies",
+ "type": "not_used",
+ "value": []
+ },
+ {
+ "field": "collectedfromid",
+ "type": "string",
+ "value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
+ },
+ {
+ "field": "collectedfromname",
+ "type": "string",
+ "value": "Jurnal Fakultas Sastra Universitas Ekasakti"
+ },
+ {
+ "field": "datasourcetype",
+ "type": "string",
+ "value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies"
+ },
+ {
+ "field": "provenanceaction",
+ "type": "not_used",
+ "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
+ },
+ {
+ "field": "journal",
+ "type": "string",
+ "value": "2579-5449@@@2597-6540@@@"
+ }
+]
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json
new file mode 100644
index 0000000000..f766246bcc
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json
@@ -0,0 +1,127 @@
+[
+ {
+ "field": "organizationid",
+ "type": "string",
+ "value": "openaire____::openaire____::microsoft"
+ },
+ {
+ "field": "legalshortname",
+ "type": "string",
+ "value": "MSFTResearch"
+ },
+ {
+ "field": "legalname",
+ "type": "string",
+ "value": "Microsoft Research"
+ },
+ {
+ "field": "websiteurl",
+ "type": "string",
+ "value": "https://www.microsoft.com/en-us/research/"
+ },
+ {
+ "field": "logourl",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "eclegalbody",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "eclegalperson",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecnonprofit",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecresearchorganization",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "echighereducation",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecinternationalorganizationeurinterests",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecinternationalorganization",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecenterprise",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecsmevalidated",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecnutscode",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "dateofcollection",
+ "type": "date",
+ "value": "2018-10-19"
+ },
+ {
+ "field": "dateoftransformation",
+ "type": "date",
+ "value": "2018-10-19"
+ },
+ {
+ "field": "inferred",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "deletedbyinference",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "trust",
+ "type": "string",
+ "value": "0.9"
+ },
+ {
+ "field": "inferenceprovenance",
+ "type": "string",
+ "value": ""
+ },
+ {
+ "field": "collectedfromid",
+ "type": "string",
+ "value": "openaire____::TEST"
+ },
+ {
+ "field": "collectedfromname",
+ "type": "string",
+ "value": "TEST"
+ },
+ {
+ "field": "country",
+ "type": "string",
+ "value": "US@@@US@@@dnet:countries@@@dnet:countries"
+ },
+ {
+ "field": "provenanceaction",
+ "type": "not_used",
+ "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json
new file mode 100644
index 0000000000..855e1a4839
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json
@@ -0,0 +1,72 @@
+[
+ {
+ "field": "project",
+ "type": "string",
+ "value": "nsf_________::1700003"
+ },
+ {
+ "field": "resporganization",
+ "type": "string",
+ "value": "nsf_________::University_of_Notre_Dame"
+ },
+ {
+ "field": "participantnumber",
+ "type": "not_used",
+ "value": 1
+ },
+ {
+ "field": "contribution",
+ "type": "not_used",
+ "value": null
+ },
+ {
+ "field": "startdate",
+ "type": "not_used",
+ "value": null
+ },
+ {
+ "field": "enddate",
+ "type": "not_used",
+ "value": null
+ },
+ {
+ "field": "inferred",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "deletedbyinference",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "trust",
+ "type": "string",
+ "value": "0.9"
+ },
+ {
+ "field": "inferenceprovenance",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "collectedfromid",
+ "type": "string",
+ "value": "openaire____::nsf"
+ },
+ {
+ "field": "collectedfromname",
+ "type": "string",
+ "value": "NSF - National Science Foundation"
+ },
+ {
+ "field": "semantics",
+ "type": "not_used",
+ "value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations"
+ },
+ {
+ "field": "provenanceaction",
+ "type": "not_used",
+ "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json
new file mode 100644
index 0000000000..7d6ebffbee
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json
@@ -0,0 +1,193 @@
+[
+ {
+ "field": "projectid",
+ "type": "string",
+ "value": "aka_________::100469"
+ },
+ {
+ "field": "code",
+ "type": "string",
+ "value": "100469"
+ },
+ {
+ "field": "websiteurl",
+ "type": "string",
+ "value": "http://test"
+ },
+ {
+ "field": "acronym",
+ "type": "string",
+ "value": "RMCAG"
+ },
+ {
+ "field": "title",
+ "type": "string",
+ "value": "Regulation of melanoma cell autonomous growth"
+ },
+ {
+ "field": "startdate",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "enddate",
+ "type": "date",
+ "value": null
+ },
+ {
+ "field": "callidentifier",
+ "type": "string",
+ "value": "Tutkijankoulutus ja työskentely ulkomailla/kevät TT"
+ },
+ {
+ "field": "keywords",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "duration",
+ "type": "int",
+ "value": null
+ },
+ {
+ "field": "ecsc39",
+ "type": "boolean",
+ "value": null
+ },
+ {
+ "field": "oamandatepublications",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "ecarticle29_3",
+ "type": "boolean",
+ "value": null
+ },
+ {
+ "field": "dateofcollection",
+ "type": "date",
+ "value": "2019-01-25"
+ },
+ {
+ "field": "dateoftransformation",
+ "type": "date",
+ "value": "2019-04-16"
+ },
+ {
+ "field": "inferred",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "deletedbyinference",
+ "type": "boolean",
+ "value": false
+ },
+ {
+ "field": "trust",
+ "type": "string",
+ "value": "0.9"
+ },
+ {
+ "field": "inferenceprovenance",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "optional1",
+ "type": "string",
+ "value": "9,284 €"
+ },
+ {
+ "field": "optional2",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "jsonextrainfo",
+ "type": "string",
+ "value": "{}"
+ },
+ {
+ "field": "contactfullname",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "contactfax",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "contactphone",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "contactemail",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "summary",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "currency",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "totalcost",
+ "type": "double",
+ "value": null
+ },
+ {
+ "field": "fundedamount",
+ "type": "double",
+ "value": null
+ },
+ {
+ "field": "collectedfromid",
+ "type": "string",
+ "value": "openaire____::aka"
+ },
+ {
+ "field": "collectedfromname",
+ "type": "string",
+ "value": "Academy of Finland"
+ },
+ {
+ "field": "contracttype",
+ "type": "string",
+ "value": null
+ },
+ {
+ "field": "provenanceaction",
+ "type": "not_used",
+ "value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions"
+ },
+ {
+ "field": "pid",
+ "type": "not_used",
+ "value": [
+ null
+ ]
+ },
+ {
+ "field": "subjects",
+ "type": "array",
+ "value": [
+ null
+ ]
+ },
+ {
+ "field": "fundingtree",
+ "type": "array",
+ "value": [
+ "\n aka_________::AKA\n AKA\n Academy of Finland\n Academy of Finland\n FI\n "
+ ]
+ }
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml
new file mode 100644
index 0000000000..a4793da897
--- /dev/null
+++ b/dhp-workflows/dhp-dedup-openaire/pom.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>dhp-workflows</artifactId>
+        <groupId>eu.dnetlib.dhp</groupId>
+        <version>1.1.6-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>dhp-dedup-openaire</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+                <version>4.0.1</version>
+                <executions>
+                    <execution>
+                        <id>scala-compile-first</id>
+                        <phase>initialize</phase>
+
+