diff --git a/.gitignore b/.gitignore index 28ec2ec194..a208e171fe 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ *.iws *.ipr *.iml +*.ipr +*.iws *~ .vscode .classpath @@ -21,5 +23,5 @@ /*/build /build spark-warehouse -/*/*/job-override.properties +/**/job-override.properties diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 834af77fa9..0c4637def4 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 4f99d5298b..308d787157 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT dhp-build-properties-maven-plugin @@ -76,6 +76,41 @@ + <pluginManagement> + <plugins> + <plugin> + <groupId>org.eclipse.m2e</groupId> + <artifactId>lifecycle-mapping</artifactId> + <version>1.0.0</version> + <configuration> + <lifecycleMappingMetadata> + <pluginExecutions> + <pluginExecution> + <pluginExecutionFilter> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-plugin-plugin</artifactId> + <versionRange>[3.2,)</versionRange> + <goals> + <goal>descriptor</goal> + </goals> + </pluginExecutionFilter> + <action> + <ignore /> + </action> + </pluginExecution> + </pluginExecutions> + </lifecycleMappingMetadata> + </configuration> + </plugin> + </plugins> + </pluginManagement> diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java index 6f55828ef4..a2cb8e0f1e 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -1,22 +1,21 @@ package eu.dnetlib.maven.plugin.properties; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; - -import org.junit.Before; -import org.junit.Test; +import static org.junit.jupiter.api.Assertions.*; /** - * @author mhorst + * @author mhorst, claudio.atzori * */ public class GenerateOoziePropertiesMojoTest { private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); - @Before + @BeforeEach public void clearSystemProperties() { System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); @@ -28,7 +27,7 @@ public class GenerateOoziePropertiesMojoTest { mojo.execute(); // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } @Test diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 51d9575ffd..4b72130787 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -1,51 +1,41 @@ package
eu.dnetlib.maven.plugin.properties; -import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.mockito.Mockito.doReturn; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Properties; - import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.project.MavenProject; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; -import org.mockito.runners.MockitoJUnitRunner; +import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; +import java.io.*; +import java.util.Properties; + +import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.lenient; /** - * @author mhorst + * @author mhorst, claudio.atzori * */ -@RunWith(MockitoJUnitRunner.class) +@ExtendWith(MockitoExtension.class) public class WritePredefinedProjectPropertiesTest { - @Rule - public TemporaryFolder testFolder = new TemporaryFolder(); - @Mock private MavenProject mavenProject; private WritePredefinedProjectProperties mojo; - @Before - public void init() { + @BeforeEach + public void init(@TempDir File testFolder) { + MockitoAnnotations.initMocks(this); mojo = new WritePredefinedProjectProperties(); - mojo.outputFile = getPropertiesFileLocation(); + mojo.outputFile = getPropertiesFileLocation(testFolder); mojo.project = mavenProject; - doReturn(new Properties()).when(mavenProject).getProperties(); + lenient().doReturn(new Properties()).when(mavenProject).getProperties(); } // ----------------------------------- TESTS --------------------------------------------- @@ -57,7 +47,7 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); assertEquals(0, storedProperties.size()); } @@ -75,28 +65,28 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(key)); assertEquals(value, storedProperties.getProperty(key)); } - @Test(expected=MojoExecutionException.class) - public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception { + @Test() + public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; Properties projectProperties = new Properties(); projectProperties.setProperty(key, value); doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.outputFile = testFolder.getRoot(); + mojo.outputFile = testFolder; // execute - 
mojo.execute(); + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } @Test - public void testExecuteWithProjectPropertiesExclusion() throws Exception { + public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -113,14 +103,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(key)); assertEquals(value, storedProperties.getProperty(key)); } @Test - public void testExecuteWithProjectPropertiesInclusion() throws Exception { + public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -137,14 +127,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } @Test - public void testExecuteIncludingPropertyKeysFromFile() throws Exception { + public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -155,7 +145,7 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties"); + File includedPropertiesFile = new File(testFolder, "included.properties"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.store(new FileWriter(includedPropertiesFile), null); @@ -167,14 +157,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } @Test - public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception { + public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -192,14 +182,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - @Test(expected=MojoExecutionException.class) - public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception { + @Test + public void 
testExecuteIncludingPropertyKeysFromBlankLocation() { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -213,11 +203,11 @@ public class WritePredefinedProjectPropertiesTest { mojo.setIncludePropertyKeysFromFiles(new String[] {""}); // execute - mojo.execute(); + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } @Test - public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception { + public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -228,7 +218,7 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml"); + File includedPropertiesFile = new File(testFolder, "included.xml"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); @@ -240,14 +230,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(1, storedProperties.size()); assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - @Test(expected=MojoExecutionException.class) - public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception { + @Test + public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -258,7 +248,7 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml"); + File includedPropertiesFile = new File(testFolder, "included.xml"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.store(new FileOutputStream(includedPropertiesFile), null); @@ -266,11 +256,11 @@ public class WritePredefinedProjectPropertiesTest { mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); // execute - mojo.execute(); + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } @Test - public void testExecuteWithQuietModeOn() throws Exception { + public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { // given mojo.setQuiet(true); mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); @@ -280,21 +270,21 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertEquals(0, storedProperties.size()); } - @Test(expected=MojoExecutionException.class) - public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception { + @Test + public void testExecuteIncludingPropertyKeysFromInvalidFile() { // given mojo.setIncludePropertyKeysFromFiles(new 
String[] {"invalid location"}); // execute - mojo.execute(); + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } @Test - public void testExecuteWithEnvironmentProperties() throws Exception { + public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { // given mojo.setIncludeEnvironmentVariables(true); @@ -303,7 +293,7 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertTrue(storedProperties.size() > 0); for (Object currentKey : storedProperties.keySet()) { assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV)); @@ -311,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteWithSystemProperties() throws Exception { + public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { // given String key = "systemPropertyKey"; String value = "systemPropertyValue"; @@ -323,14 +313,14 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertTrue(storedProperties.size() > 0); assertTrue(storedProperties.containsKey(key)); assertEquals(value, storedProperties.getProperty(key)); } @Test - public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception { + public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception { // given String key = "systemPropertyKey "; String value = "systemPropertyValue"; @@ -344,7 +334,7 @@ public class WritePredefinedProjectPropertiesTest { // assert assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(); + Properties storedProperties = getStoredProperties(testFolder); assertTrue(storedProperties.size() > 0); assertFalse(storedProperties.containsKey(key)); assertTrue(storedProperties.containsKey(key.trim())); @@ -353,13 +343,13 @@ public class WritePredefinedProjectPropertiesTest { // ----------------------------------- PRIVATE ------------------------------------------- - private File getPropertiesFileLocation() { - return new File(testFolder.getRoot(), "test.properties"); + private File getPropertiesFileLocation(File testFolder) { + return new File(testFolder, "test.properties"); } - private Properties getStoredProperties() throws FileNotFoundException, IOException { + private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException { Properties properties = new Properties(); - properties.load(new FileInputStream(getPropertiesFileLocation())); + properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); return properties; } } diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index e0b20204c9..e471af76d5 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT dhp-build pom diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 345a5475fa..f6283d450c 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT ../ @@ -42,6 +42,23 @@ com.rabbitmq amqp-client + + net.sf.saxon + Saxon-HE + + + org.slf4j + jcl-over-slf4j + + + org.apache.cxf + cxf-rt-transports-http + + + eu.dnetlib + cnr-rmi-api + + com.ximpleware 
vtd-xml diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java new file mode 100644 index 0000000000..c74cf3c111 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -0,0 +1,24 @@ +package eu.dnetlib.dhp.utils; + +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; + +public class ISLookupClientFactory { + + private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); + + public static ISLookUpService getLookUpService(final String isLookupUrl) { + return getServiceStub(ISLookUpService.class, isLookupUrl); + } + + @SuppressWarnings("unchecked") + private static T getServiceStub(final Class clazz, final String endpoint) { + log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); + final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); + jaxWsProxyFactory.setServiceClass(clazz); + jaxWsProxyFactory.setAddress(endpoint); + return (T) jaxWsProxyFactory.create(); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java new file mode 100644 index 0000000000..bd39624404 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -0,0 +1,32 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.lib.ExtensionFunctionCall; +import net.sf.saxon.lib.ExtensionFunctionDefinition; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.om.StructuredQName; +import net.sf.saxon.trans.XPathException; + +public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { + + public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; + + public abstract String getName(); + public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + + @Override + public StructuredQName getFunctionQName() { + return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); + } + + @Override + public ExtensionFunctionCall makeCallExpression() { + return new ExtensionFunctionCall() { + @Override + public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { + return doCall(context, arguments); + } + }; + } + +} \ No newline at end of file diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java new file mode 100644 index 0000000000..f90e2a23e8 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -0,0 +1,67 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Item; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.GregorianCalendar; + +public class ExtractYear extends AbstractExtensionFunction { + + private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" }; + + @Override + public String 
getName() { + return "extractYear"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null || arguments.length == 0) { + return new StringValue(""); + } + final Item item = arguments[0].head(); + if (item == null) { + return new StringValue(""); + } + return new StringValue(_year(item.getStringValue())); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + Calendar c = new GregorianCalendar(); + for (String format : dateFormats) { + try { + c.setTime(new SimpleDateFormat(format).parse(s)); + String year = String.valueOf(c.get(Calendar.YEAR)); + return year; + } catch (ParseException e) { + // the value does not match this format, try the next one + } + } + return ""; + } +}
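+// Usage sketch (illustrative, not part of the original patch): once these functions are
+// registered through the SaxonTransformerFactory further below, a stylesheet declaring
+// xmlns:dnet="http://www.d-net.research-infrastructures.eu/saxon-extension" can invoke them,
+// e.g. <xsl:value-of select="dnet:extractYear(//*[local-name() = 'dateAccepted'])"/>
+// ('dateAccepted' is only an example element name).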
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java new file mode 100644 index 0000000000..634e08788b --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -0,0 +1,66 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; + +public class NormalizeDate extends AbstractExtensionFunction { + + private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" }; + + private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'"; + + @Override + public String getName() { + return "normalizeDate"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null || arguments.length == 0) { + return new StringValue(""); + } + String s = arguments[0].head().getStringValue(); + return new StringValue(_year(s)); + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 1; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + + private String _year(String s) { + final String date = s != null ? s.trim() : ""; + + for (String format : normalizeDateFormats) { + try { + Date parse = new SimpleDateFormat(format).parse(date); + String res = new SimpleDateFormat(normalizeOutFormat).format(parse); + return res; + } catch (ParseException e) { + // the value does not match this format, try the next one + } + } + return ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java new file mode 100644 index 0000000000..a221e37c67 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -0,0 +1,60 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.expr.XPathContext; +import net.sf.saxon.om.Item; +import net.sf.saxon.om.Sequence; +import net.sf.saxon.trans.XPathException; +import net.sf.saxon.value.SequenceType; +import net.sf.saxon.value.StringValue; +import org.apache.commons.lang3.StringUtils; + +public class PickFirst extends AbstractExtensionFunction { + + @Override + public String getName() { + return "pickFirst"; + } + + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null || arguments.length == 0) { + return new StringValue(""); + } + + final String s1 = getValue(arguments[0]); + final String s2 = arguments.length > 1 ? getValue(arguments[1]) : ""; + + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + } + + private String getValue(final Sequence arg) throws XPathException { + if (arg != null) { + final Item item = arg.head(); + if (item != null) { + return item.getStringValue(); + } + } + return ""; + } + + @Override + public int getMinimumNumberOfArguments() { + return 0; + } + + @Override + public int getMaximumNumberOfArguments() { + return 2; + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + } + + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java new file mode 100644 index 0000000000..611709ff0d --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -0,0 +1,30 @@ +package eu.dnetlib.dhp.utils.saxon; + +import net.sf.saxon.Configuration; +import net.sf.saxon.TransformerFactoryImpl; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamSource; +import java.io.StringReader; + +public class SaxonTransformerFactory { + + /** + * Creates the index record transformer from the given XSLT + * @param xslt + * @return + * @throws TransformerException + */ + public static Transformer newInstance(final String xslt) throws TransformerException { + + final TransformerFactoryImpl factory = new TransformerFactoryImpl(); + final Configuration conf = factory.getConfiguration(); + conf.registerExtensionFunction(new ExtractYear()); + conf.registerExtensionFunction(new NormalizeDate()); + conf.registerExtensionFunction(new PickFirst()); + + return factory.newTransformer(new StreamSource(new StringReader(xslt))); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java index fdea3c2d41..f4598ebd49 100644 ---
a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -1,18 +1,13 @@ package eu.dnetlib.dhp.application; import org.apache.commons.io.IOUtils; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import java.io.ByteArrayOutputStream; -import java.util.Base64; -import java.util.zip.GZIPOutputStream; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class ArgumentApplicationParserTest { - @Test public void testParseParameter() throws Exception { final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java index 4515429eab..a2bac54baf 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.model.mdstore; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertTrue; public class MetadataRecordTest { @@ -10,6 +10,6 @@ public class MetadataRecordTest { public void getTimestamp() { MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() >0); + assertTrue(r.getDateOfCollection() > 0); } } \ No newline at end of file diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java index fbc9dc2514..73df63b321 100644 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java @@ -1,12 +1,12 @@ package eu.dnetlib.message; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.HashMap; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class MessageTest { diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java index db6f4429a2..eb9fb172d6 100644 --- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java @@ -1,7 +1,7 @@ package eu.dnetlib.scholexplorer.relation; -import org.apache.commons.io.IOUtils; -import org.junit.Test; +import org.junit.jupiter.api.Test; + public class RelationMapperTest { diff --git a/dhp-doc-resources/img/data_provision_workflow.png b/dhp-doc-resources/img/data_provision_workflow.png new file mode 100644 index 0000000000..31979fbb49 Binary files /dev/null and b/dhp-doc-resources/img/data_provision_workflow.png differ diff --git a/dhp-schemas/README.md b/dhp-schemas/README.md index 473ad4cf19..7431cda426 100644 --- a/dhp-schemas/README.md +++ b/dhp-schemas/README.md @@ -1,3 +1,11 @@ Description of the project -------------------------- -This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the 
system. +This project defines the **object schemas** of the OpenAIRE main entities and the relationships that exist among them. +Namely, it defines the model for: + +- the **research product (result)**, subclassed into publication, dataset, other research product and software +- the **data source** object, describing the data provider (institutional repository, aggregator, CRIS system) +- the **organization** object, describing research bodies managing a data source or participating in a research project +- the **project** object, describing a research project + +The serialization of such objects (data store files) is used to pass data between workflow nodes in the processing pipeline. diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml index 8bc30a8b0e..a85c0dd230 100644 --- a/dhp-schemas/pom.xml +++ b/dhp-schemas/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT ../ @@ -27,18 +27,15 @@ - eu.dnetlib.dhp - dhp-common - ${project.version} + com.fasterxml.jackson.core + jackson-databind - com.fasterxml.jackson.core - jackson-databind - test + com.google.guava + guava - diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java new file mode 100644 index 0000000000..0f9aa3adbc --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -0,0 +1,38 @@ +package eu.dnetlib.dhp.schema.action; + +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +import java.io.Serializable; + +@JsonDeserialize(using = AtomicActionDeserializer.class) +public class AtomicAction<T extends Oaf> implements Serializable { + + private Class<T> clazz; + + private T payload; + + public AtomicAction() { + } + + public AtomicAction(Class<T> clazz, T payload) { + this.clazz = clazz; + this.payload = payload; + } + + public Class<T> getClazz() { + return clazz; + } + + public void setClazz(Class<T> clazz) { + this.clazz = clazz; + } + + public T getPayload() { + return payload; + } + + public void setPayload(T payload) { + this.payload = payload; + } +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java new file mode 100644 index 0000000000..e6017288fa --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -0,0 +1,29 @@ +package eu.dnetlib.dhp.schema.action; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +import java.io.IOException; + +public class AtomicActionDeserializer extends JsonDeserializer { + + @Override + public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException { + JsonNode node = jp.getCodec().readTree(jp); + String classTag = node.get("clazz").asText(); + JsonNode payload = node.get("payload"); + ObjectMapper mapper = new ObjectMapper(); + + try { + final Class clazz = Class.forName(classTag); + return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } +}
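+// Wire-format sketch (values illustrative, not part of the original sources): this pair of
+// classes round-trips JSON shaped like
+// {"clazz": "eu.dnetlib.dhp.schema.oaf.Relation", "payload": {"source": "1", "target": "2", ...}},
+// where the "clazz" field drives the reflective Class.forName lookup and "payload" is bound to that class.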
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java new file mode 100644 index 0000000000..e81120e428 --- /dev/null +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.schema.oaf; + +public class Country extends Qualifier { + + private DataInfo dataInfo; + + public DataInfo getDataInfo() { + return dataInfo; + } + + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } + +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java index f52a500fe6..032468de2f 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java @@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable { private List<Field<String>> odlanguages; - private List< Field<String>> odcontenttypes; + private List<Field<String>> odcontenttypes; - private List< Field<String>> accessinfopackage; + private List<Field<String>> accessinfopackage; // re3data fields private Field releasestartdate; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java index 43af60286d..1839fbd53e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; @@ -36,7 +37,7 @@ public class GeoLocation implements Serializable { this.place = place; } - + @JsonIgnore public boolean isBlank() { return StringUtils.isBlank(point) && StringUtils.isBlank(box) && diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 8f852af65d..f82296d8bf 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -22,6 +22,14 @@ public class Instance implements Serializable { private Field dateofacceptance; + // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results + private Field processingchargeamount; + + // currency - alphabetic code described in ISO 4217.
Defined here to cope with possible wrongly typed results + private Field processingchargecurrency; + + private Field refereed; //peer-review status + public Field getLicense() { return license; } @@ -86,7 +94,29 @@ public class Instance implements Serializable { this.dateofacceptance = dateofacceptance; } + public Field getProcessingchargeamount() { + return processingchargeamount; + } + public void setProcessingchargeamount(Field processingchargeamount) { + this.processingchargeamount = processingchargeamount; + } + + public Field getProcessingchargecurrency() { + return processingchargecurrency; + } + + public void setProcessingchargecurrency(Field processingchargecurrency) { + this.processingchargecurrency = processingchargecurrency; + } + + public Field getRefereed() { + return refereed; + } + + public void setRefereed(Field refereed) { + this.refereed = refereed; + } public String toComparableString(){ return String.format("%s::%s::%s::%s", diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 59cefa40e9..5a841b96f5 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -1,12 +1,10 @@ package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JacksonInject; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; -@JsonIgnoreProperties({"blank"}) + public class KeyValue implements Serializable { private String key; @@ -39,7 +37,6 @@ public class KeyValue implements Serializable { this.dataInfo = dataInfo; } - @JsonIgnore public String toComparableString() { return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 7e4660f4b4..ae2bf1a602 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; +import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.commons.lang3.StringUtils; import java.io.Serializable; @@ -50,6 +51,8 @@ public class Qualifier implements Serializable { schemeid != null ? schemeid : "", schemename != null ? 
schemename : ""); } + + @JsonIgnore public boolean isBlank() { return StringUtils.isBlank(classid) && StringUtils.isBlank(classname) && diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 03122983dc..6738b86938 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,85 +1,104 @@ package eu.dnetlib.dhp.schema.oaf; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.google.common.base.Preconditions.checkArgument; public class Relation extends Oaf { - private String relType; + private String relType; - private String subRelType; + private String subRelType; - private String relClass; + private String relClass; - private String source; + private String source; - private String target; + private String target; - private List collectedFrom; + private List collectedFrom = new ArrayList<>(); - public String getRelType() { - return relType; - } + public String getRelType() { + return relType; + } - public void setRelType(String relType) { - this.relType = relType; - } + public void setRelType(final String relType) { + this.relType = relType; + } - public String getSubRelType() { - return subRelType; - } + public String getSubRelType() { + return subRelType; + } - public void setSubRelType(String subRelType) { - this.subRelType = subRelType; - } + public void setSubRelType(final String subRelType) { + this.subRelType = subRelType; + } - public String getRelClass() { - return relClass; - } + public String getRelClass() { + return relClass; + } - public void setRelClass(String relClass) { - this.relClass = relClass; - } + public void setRelClass(final String relClass) { + this.relClass = relClass; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(String source) { - this.source = source; - } + public void setSource(final String source) { + this.source = source; + } - public String getTarget() { - return target; - } + public String getTarget() { + return target; + } - public void setTarget(String target) { - this.target = target; - } + public void setTarget(final String target) { + this.target = target; + } - public List getCollectedFrom() { - return collectedFrom; - } + public List getCollectedFrom() { + return collectedFrom; + } - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } + public void setCollectedFrom(final List collectedFrom) { + this.collectedFrom = collectedFrom; + } - public void mergeFrom(Relation other) { - this.mergeOAFDataInfo(other); - if (other.getCollectedFrom() == null || other.getCollectedFrom().size() == 0) - return; - if (collectedFrom == null && other.getCollectedFrom() != null) { - collectedFrom = other.getCollectedFrom(); - return; - } - if (other.getCollectedFrom() != null) { - collectedFrom.addAll(other.getCollectedFrom()); + public void mergeFrom(final Relation r) { + + checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal"); + checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal"); + checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal"); + checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be 
equal"); + checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal"); + + setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream()) + .distinct() // relies on KeyValue.equals + .collect(Collectors.toList())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Relation relation = (Relation) o; + return relType.equals(relation.relType) && + subRelType.equals(relation.subRelType) && + relClass.equals(relation.relClass) && + source.equals(relation.source) && + target.equals(relation.target) && + Objects.equals(collectedFrom, relation.collectedFrom); + } + + @Override + public int hashCode() { + return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom); + } - collectedFrom = new ArrayList<>(collectedFrom - .stream() - .collect(Collectors.toMap(KeyValue::toComparableString, x -> x, (x1, x2) -> x1)) - .values()); - } - } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index eb5572ce13..b6c73e84a6 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -1,12 +1,10 @@ package eu.dnetlib.dhp.schema.oaf; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; +import java.util.Comparator; +import java.util.List; -public abstract class Result extends OafEntity implements Serializable { +public class Result extends OafEntity implements Serializable { private List author; @@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable { // common fields private Qualifier language; - private List country; + private List country; private List subject; @@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable { private List> coverage; - private Field refereed; //peer-review status + private Qualifier bestaccessright; private List context; - // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results - private Field processingchargeamount; - - // currency - alphabetic code describe in ISO-4217. 
Defined here to cope with possible wrongly typed results - private Field processingchargecurrency; - private List externalReference; private List instance; @@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable { this.language = language; } - public List getCountry() { + public List getCountry() { return country; } - public void setCountry(List country) { + public void setCountry(List country) { this.country = country; } @@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable { this.coverage = coverage; } - public Field getRefereed() { - return refereed; + public Qualifier getBestaccessright() { + return bestaccessright; } - public void setRefereed(Field refereed) { - this.refereed = refereed; + public void setBestaccessright(Qualifier bestaccessright) { + this.bestaccessright = bestaccessright; } public List getContext() { @@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable { this.instance = instance; } - public Field getProcessingchargeamount() { - return processingchargeamount; - } - - public Result setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - return this; - } - - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } - - public Result setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - return this; - } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); @@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable { coverage = mergeLists(coverage, r.getCoverage()); - if (r.getRefereed() != null && compareTrust(this, r) < 0) - refereed = r.getRefereed(); - context = mergeLists(context, r.getContext()); - if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0) - processingchargeamount = r.getProcessingchargeamount(); - - if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0) - processingchargecurrency = r.getProcessingchargecurrency(); - externalReference = mergeLists(externalReference, r.getExternalReference()); - } @@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable { return a.size() > b.size() ? 
a : b; } - } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java new file mode 100644 index 0000000000..d216c05d5f --- /dev/null +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java @@ -0,0 +1,42 @@ +package eu.dnetlib.dhp.schema.action; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.lang3.StringUtils; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * @author claudio.atzori + */ +public class AtomicActionTest { + + @Test + public void serializationTest() throws IOException { + + Relation rel = new Relation(); + rel.setSource("1"); + rel.setTarget("2"); + rel.setRelType("resultResult"); + rel.setSubRelType("dedup"); + rel.setRelClass("merges"); + + AtomicAction aa1 = new AtomicAction(Relation.class, rel); + + final ObjectMapper mapper = new ObjectMapper(); + String json = mapper.writeValueAsString(aa1); + + assertTrue(StringUtils.isNotBlank(json)); + + AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); + + assertEquals(aa1.getClazz(), aa2.getClazz()); + assertEquals(aa1.getPayload(), aa2.getPayload()); + + } + +} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index e487ddcbaa..ac4bd5d27d 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -1,11 +1,9 @@ package eu.dnetlib.dhp.schema.oaf; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; +import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; -import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -13,7 +11,7 @@ public class MergeTest { OafEntity oaf; - @Before + @BeforeEach public void setUp() { oaf = new Publication(); } @@ -44,8 +42,8 @@ public class MergeTest { a.mergeFrom(b); - Assert.assertNotNull(a.getCollectedfrom()); - Assert.assertEquals(3, a.getCollectedfrom().size()); + assertNotNull(a.getCollectedfrom()); + assertEquals(3, a.getCollectedfrom().size()); } @@ -60,8 +58,8 @@ public class MergeTest { a.mergeFrom(b); - Assert.assertNotNull(a.getSubject()); - Assert.assertEquals(3, a.getSubject().size()); + assertNotNull(a.getSubject()); + assertEquals(3, a.getSubject().size()); } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java index 54f5f5f06f..6a88151c95 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java @@ -6,7 +6,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Arrays; diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index c6bb99fc3e..95e9578478 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ 
b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT dhp-aggregation @@ -24,6 +24,61 @@ eu.dnetlib.dhp dhp-common ${project.version} + + + com.sun.xml.bind + jaxb-core + + + + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + + eu.dnetlib + dnet-actionmanager-common + + + eu.dnetlib + dnet-openaireplus-mapping-utils + + + saxonica + saxon + + + saxonica + saxon-dom + + + jgrapht + jgrapht + + + net.sf.ehcache + ehcache + + + org.springframework + spring-test + + + org.apache.* + * + + + apache + * + + + + + eu.dnetlib + dnet-openaire-data-protos @@ -44,13 +99,22 @@ jaxen jaxen + + + org.mongodb + mongo-java-driver + - org.mockito - mockito-core - 2.25.0 - test + org.apache.hadoop + hadoop-distcp + + + + org.postgresql + postgresql + 42.2.10 diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java new file mode 100644 index 0000000000..9d0e82aca9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java @@ -0,0 +1,49 @@ +package eu.dnetlib.dhp.migration.actions; + +import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; + +import java.util.Comparator; + +public class LicenseComparator implements Comparator { + + @Override + public int compare(Qualifier left, Qualifier right) { + + if (left == null && right == null) return 0; + if (left == null) return 1; + if (right == null) return -1; + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) return 0; + + if (lClass.equals("OPEN SOURCE")) return -1; + if (rClass.equals("OPEN SOURCE")) return 1; + + if (lClass.equals("OPEN")) return -1; + if (rClass.equals("OPEN")) return 1; + + if (lClass.equals("6MONTHS")) return -1; + if (rClass.equals("6MONTHS")) return 1; + + if (lClass.equals("12MONTHS")) return -1; + if (rClass.equals("12MONTHS")) return 1; + + if (lClass.equals("EMBARGO")) return -1; + if (rClass.equals("EMBARGO")) return 1; + + if (lClass.equals("RESTRICTED")) return -1; + if (rClass.equals("RESTRICTED")) return 1; + + if (lClass.equals("CLOSED")) return -1; + if (rClass.equals("CLOSED")) return 1; + + if (lClass.equals("UNKNOWN")) return -1; + if (rClass.equals("UNKNOWN")) return 1; + + // Else (but unlikely), lexicographical ordering will do. 
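+ // For instance, a qualifier with classid "OPEN" sorts before one with "RESTRICTED", while
+ // "CLOSED" sorts after "OPEN SOURCE": sorting access-right qualifiers with this comparator
+ // therefore puts the most open one first.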
+ return lClass.compareTo(rClass); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java new file mode 100644 index 0000000000..487fac359c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java @@ -0,0 +1,170 @@ +package eu.dnetlib.dhp.migration.actions; + +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.DistCp; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.util.ToolRunner; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.util.*; +import java.util.stream.Collectors; + +public class MigrateActionSet { + + private static final Log log = LogFactory.getLog(MigrateActionSet.class); + + private static final String SEPARATOR = "/"; + private static final String TARGET_PATHS = "target_paths"; + private static final String RAWSET_PREFIX = "rawset_"; + + private static Boolean DEFAULT_TRANSFORM_ONLY = false; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateActionSet.class.getResourceAsStream( + "/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json"))); + parser.parseArgument(args); + + new MigrateActionSet().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws Exception { + + final String isLookupUrl = parser.get("isLookupUrl"); + final String sourceNN = parser.get("sourceNameNode"); + final String targetNN = parser.get("targetNameNode"); + final String workDir = parser.get("workingDirectory"); + final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); + + final String distcp_memory_mb = parser.get("distcp_memory_mb"); + final String distcp_task_timeout = parser.get("distcp_task_timeout"); + + final String transform_only_s = parser.get("transform_only"); + + log.info("transform only param: " + transform_only_s); + + final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); + + log.info("transform only: " + transformOnly); + + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + + Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + FileSystem targetFS = FileSystem.get(conf); + + Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); + FileSystem sourceFS = FileSystem.get(sourceConf); + + Properties props = new Properties(); + + List targetPaths = new ArrayList<>(); + + final List sourcePaths = getSourcePaths(sourceNN, isLookUp); + log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> 
p.toString()).collect(Collectors.joining("\n"))));
+        for(Path source : sourcePaths) {
+
+            if (!sourceFS.exists(source)) {
+                log.warn(String.format("skipping nonexistent path: %s", source));
+            } else {
+
+                LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
+
+                final String rawSet = pathQ.pollLast();
+                log.info(String.format("got RAWSET: %s", rawSet));
+
+                if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
+
+                    final String actionSetDirectory = pathQ.pollLast();
+
+                    final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
+
+                    log.info(String.format("using TARGET PATH: %s", targetPath));
+
+                    if (!transformOnly) {
+                        if (targetFS.exists(targetPath)) {
+                            targetFS.delete(targetPath, true);
+                        }
+                        runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
+                    }
+
+                    targetPaths.add(targetPath);
+                }
+            }
+        }
+
+        props.setProperty(TARGET_PATHS, targetPaths
+                .stream()
+                .map(p -> p.toString())
+                .collect(Collectors.joining(",")));
+        File file = new File(System.getProperty("oozie.action.output.properties"));
+
+        try(OutputStream os = new FileOutputStream(file)) {
+            props.store(os, "");
+        }
+        System.out.println(file.getAbsolutePath());
+    }
+
+    private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
+
+        final DistCpOptions op = new DistCpOptions(source, targetPath);
+        op.setMaxMaps(distcp_num_maps);
+        op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
+        op.preserve(DistCpOptions.FileAttribute.REPLICATION);
+        op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
+
+        int res = ToolRunner.run(new DistCp(conf, op), new String[]{
+                "-Dmapred.task.timeout=" + distcp_task_timeout,
+                "-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
+                "-pb",
+                "-m " + distcp_num_maps,
+                source.toString(),
+                targetPath.toString()});
+
+        if (res != 0) {
+            throw new RuntimeException(String.format("distcp exited with code %s", res));
+        }
+    }
+
+    private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
+        final Configuration conf = new Configuration();
+        conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
+        conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
+        conf.set("dfs.http.client.retry.policy.enabled", "true");
+        conf.set("mapred.task.timeout", distcp_task_timeout);
+        conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
+        conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
+        return conf;
+    }
+
+    private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
+        String XQUERY = "distinct-values(\n" +
+                "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
+                "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
+                "let $setDir := $x//SET/@directory/string()\n" +
+                "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
+                "return concat($basePath, '/', $setDir, '/', $rawSet))";
+
+        log.info(String.format("running xquery:\n%s", XQUERY));
+        return isLookUp.quickSearchProfile(XQUERY)
+                .stream()
+                .map(p -> sourceNN + p)
+                .map(Path::new)
+                .collect(Collectors.toList());
+    }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java new file mode 100644 index 0000000000..a7e70ee813 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java @@ -0,0 +1,580 @@ +package eu.dnetlib.dhp.migration.actions; + +import com.google.common.collect.Lists; +import com.googlecode.protobuf.format.JsonFormat; +import eu.dnetlib.data.proto.*; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.lang3.StringUtils; + +import java.io.Serializable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +public class ProtoConverter implements Serializable { + + public static final String UNKNOWN = "UNKNOWN"; + public static final String NOT_AVAILABLE = "not available"; + public static final String DNET_ACCESS_MODES = "dnet:access_modes"; + + public static Oaf convert(OafProtos.Oaf oaf) { + try { + switch (oaf.getKind()) { + case entity: + return convertEntity(oaf); + case relation: + return convertRelation(oaf); + default: + throw new IllegalArgumentException("invalid kind " + oaf.getKind()); + } + } catch (Throwable e) { + throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); + } + } + + private static Relation convertRelation(OafProtos.Oaf oaf) { + final OafProtos.OafRel r = oaf.getRel(); + final Relation rel = new Relation(); + rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); + rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); + rel.setSource(r.getSource()); + rel.setTarget(r.getTarget()); + rel.setRelType(r.getRelType().toString()); + rel.setSubRelType(r.getSubRelType().toString()); + rel.setRelClass(r.getRelClass()); + rel.setCollectedFrom(r.getCollectedfromCount() > 0 ? 
+ r.getCollectedfromList().stream() + .map(kv -> mapKV(kv)) + .collect(Collectors.toList()) : null); + return rel; + } + + private static OafEntity convertEntity(OafProtos.Oaf oaf) { + + switch (oaf.getEntity().getType()) { + case result: + final Result r = convertResult(oaf); + r.setInstance(convertInstances(oaf)); + return r; + case project: + return convertProject(oaf); + case datasource: + return convertDataSource(oaf); + case organization: + return convertOrganization(oaf); + default: + throw new RuntimeException("received unknown type"); + } + } + + private static List convertInstances(OafProtos.Oaf oaf) { + + final ResultProtos.Result r = oaf.getEntity().getResult(); + if (r.getInstanceCount() > 0) { + return r.getInstanceList() + .stream() + .map(i -> convertInstance(i)) + .collect(Collectors.toList()); + } + return Lists.newArrayList(); + } + + private static Instance convertInstance(ResultProtos.Result.Instance ri) { + final Instance i = new Instance(); + i.setAccessright(mapQualifier(ri.getAccessright())); + i.setCollectedfrom(mapKV(ri.getCollectedfrom())); + i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); + i.setDistributionlocation(ri.getDistributionlocation()); + i.setHostedby(mapKV(ri.getHostedby())); + i.setInstancetype(mapQualifier(ri.getInstancetype())); + i.setLicense(mapStringField(ri.getLicense())); + i.setUrl(ri.getUrlList()); + i.setRefereed(mapStringField(ri.getRefereed())); + i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); + i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); + return i; + } + + private static Organization convertOrganization(OafProtos.Oaf oaf) { + final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); + final Organization org = setOaf(new Organization(), oaf); + setEntity(org, oaf); + org.setLegalshortname(mapStringField(m.getLegalshortname())); + org.setLegalname(mapStringField(m.getLegalname())); + org.setAlternativeNames(m.getAlternativeNamesList(). 
+                stream()
+                .map(ProtoConverter::mapStringField)
+                .collect(Collectors.toList()));
+        org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
+        org.setLogourl(mapStringField(m.getLogourl()));
+        org.setEclegalbody(mapStringField(m.getEclegalbody()));
+        org.setEclegalperson(mapStringField(m.getEclegalperson()));
+        org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
+        org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
+        org.setEchighereducation(mapStringField(m.getEchighereducation()));
+        org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
+        org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
+        org.setEcenterprise(mapStringField(m.getEcenterprise()));
+        org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
+        org.setEcnutscode(mapStringField(m.getEcnutscode()));
+        org.setCountry(mapQualifier(m.getCountry()));
+
+        return org;
+    }
+
+    private static Datasource convertDataSource(OafProtos.Oaf oaf) {
+        final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
+        final Datasource datasource = setOaf(new Datasource(), oaf);
+        setEntity(datasource, oaf);
+        datasource.setAccessinfopackage(m.getAccessinfopackageList()
+                .stream()
+                .map(ProtoConverter::mapStringField)
+                .collect(Collectors.toList()));
+        datasource.setCertificates(mapStringField(m.getCertificates()));
+        datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
+        datasource.setContactemail(mapStringField(m.getContactemail()));
+        datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
+        datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
+        datasource.setDataprovider(mapBoolField(m.getDataprovider()));
+        datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
+        datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
+        datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
+        datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
+        datasource.setDescription(mapStringField(m.getDescription()));
+        datasource.setEnglishname(mapStringField(m.getEnglishname()));
+        datasource.setLatitude(mapStringField(m.getLatitude()));
+        datasource.setLongitude(mapStringField(m.getLongitude()));
+        datasource.setLogourl(mapStringField(m.getLogourl()));
+        datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
+        datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
+        datasource.setOdcontenttypes(m.getOdcontenttypesList()
+                .stream()
+                .map(ProtoConverter::mapStringField)
+                .collect(Collectors.toList()));
+        datasource.setOdlanguages(m.getOdlanguagesList()
+                .stream()
+                .map(ProtoConverter::mapStringField)
+                .collect(Collectors.toList()));
+        datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
+        datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
+        datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
+        datasource.setOfficialname(mapStringField(m.getOfficialname()));
+        datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
+        datasource.setPidsystems(mapStringField(m.getPidsystems()));
+        datasource.setPolicies(m.getPoliciesList()
+                .stream()
+                .map(ProtoConverter::mapKV)
+                .collect(Collectors.toList())); +
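Worth noting: every convert* method above follows the same shape, turning each repeated protobuf field into a list of model objects through a stream().map(...).collect(...) pipeline driven by a small per-type converter passed as a method reference (e.g. ProtoConverter::mapStringField). A reduced sketch of the pattern, with hypothetical stand-in types instead of the generated protobuf classes:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Hypothetical stand-ins for a generated protobuf message and the model field type.
class ProtoString { final String value; ProtoString(String value) { this.value = value; } }
class StringField { String value; void setValue(String v) { this.value = v; } }

public class RepeatedFieldMapping {

    // One small converter per field type...
    static StringField mapStringField(ProtoString s) {
        StringField f = new StringField();
        f.setValue(s.value);
        return f;
    }

    // ...applied uniformly to every repeated field through a method reference.
    static List<StringField> mapAll(List<ProtoString> repeated) {
        return repeated.stream()
                .map(RepeatedFieldMapping::mapStringField)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<StringField> out = mapAll(Arrays.asList(new ProtoString("a"), new ProtoString("b")));
        System.out.println(out.size()); // 2
    }
}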
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); + datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); + datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); + datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); + datasource.setSubjects(m.getSubjectsList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + datasource.setVersioning(mapBoolField(m.getVersioning())); + datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); + datasource.setJournal(mapJournal(m.getJournal())); + + + return datasource; + } + + private static Project convertProject(OafProtos.Oaf oaf) { + final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); + final Project project = setOaf(new Project(), oaf); + setEntity(project, oaf); + project.setAcronym(mapStringField(m.getAcronym())); + project.setCallidentifier(mapStringField(m.getCallidentifier())); + project.setCode(mapStringField(m.getCode())); + project.setContactemail(mapStringField(m.getContactemail())); + project.setContactfax(mapStringField(m.getContactfax())); + project.setContactfullname(mapStringField(m.getContactfullname())); + project.setContactphone(mapStringField(m.getContactphone())); + project.setContracttype(mapQualifier(m.getContracttype())); + project.setCurrency(mapStringField(m.getCurrency())); + project.setDuration(mapStringField(m.getDuration())); + project.setEcarticle29_3(mapStringField(m.getEcarticle293())); + project.setEcsc39(mapStringField(m.getEcsc39())); + project.setOamandatepublications(mapStringField(m.getOamandatepublications())); + project.setStartdate(mapStringField(m.getStartdate())); + project.setEnddate(mapStringField(m.getEnddate())); + project.setFundedamount(m.getFundedamount()); + project.setTotalcost(m.getTotalcost()); + project.setKeywords(mapStringField(m.getKeywords())); + project.setSubjects(m.getSubjectsList().stream() + .map(sp -> mapStructuredProperty(sp)) + .collect(Collectors.toList())); + project.setTitle(mapStringField(m.getTitle())); + project.setWebsiteurl(mapStringField(m.getWebsiteurl())); + project.setFundingtree(m.getFundingtreeList().stream() + .map(f -> mapStringField(f)) + .collect(Collectors.toList())); + project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); + project.setSummary(mapStringField(m.getSummary())); + project.setOptional1(mapStringField(m.getOptional1())); + project.setOptional2(mapStringField(m.getOptional2())); + return project; + } + + private static Result convertResult(OafProtos.Oaf oaf) { + switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { + case "dataset": + return createDataset(oaf); + case "publication": + return createPublication(oaf); + case "software": + return createSoftware(oaf); + case "other": + return createORP(oaf); + default: + Result result = setOaf(new Result(), oaf); + setEntity(result, oaf); + return setResult(result, oaf); + } + } + + private static Software createSoftware(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Software software = setOaf(new Software(), oaf); + setEntity(software, oaf); + setResult(software, oaf); + + software.setDocumentationUrl(m.getDocumentationUrlList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + software.setLicense(m.getLicenseList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + 
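convertResult above routes on the resulttype classid and deliberately falls back to a plain Result when the id is empty or unknown (e.g. for update records). A minimal sketch of that dispatch, with hypothetical stand-in classes:

// Hypothetical stand-in classes for the dispatch sketch.
class BaseResult {}
class PublicationLike extends BaseResult {}
class DatasetLike extends BaseResult {}
class SoftwareLike extends BaseResult {}
class OtherProductLike extends BaseResult {}

public class ResultTypeDispatch {

    // Unknown or empty class ids fall through to the generic type,
    // matching the default branch of convertResult above.
    static BaseResult forClassId(String classid) {
        switch (classid) {
            case "publication": return new PublicationLike();
            case "dataset":     return new DatasetLike();
            case "software":    return new SoftwareLike();
            case "other":       return new OtherProductLike();
            default:            return new BaseResult();
        }
    }

    public static void main(String[] args) {
        System.out.println(forClassId("software").getClass().getSimpleName()); // SoftwareLike
        System.out.println(forClassId("").getClass().getSimpleName());         // BaseResult
    }
}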
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); + software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); + return software; + } + + private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); + setEntity(otherResearchProducts, oaf); + setResult(otherResearchProducts, oaf); + otherResearchProducts.setContactperson(m.getContactpersonList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setContactgroup(m.getContactgroupList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setTool(m.getToolList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + + return otherResearchProducts; + } + + private static Publication createPublication(OafProtos.Oaf oaf) { + + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Publication publication = setOaf(new Publication(), oaf); + setEntity(publication, oaf); + setResult(publication, oaf); + publication.setJournal(mapJournal(m.getJournal())); + return publication; + } + + private static Dataset createDataset(OafProtos.Oaf oaf) { + + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Dataset dataset = setOaf(new Dataset(), oaf); + setEntity(dataset, oaf); + setResult(dataset, oaf); + dataset.setStoragedate(mapStringField(m.getStoragedate())); + dataset.setDevice(mapStringField(m.getDevice())); + dataset.setSize(mapStringField(m.getSize())); + dataset.setVersion(mapStringField(m.getVersion())); + dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); + dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); + dataset.setGeolocation(m.getGeolocationList() + .stream() + .map(ProtoConverter::mapGeolocation) + .collect(Collectors.toList())); + return dataset; + + } + + public static T setOaf(T oaf, OafProtos.Oaf o) { + oaf.setDataInfo(mapDataInfo(o.getDataInfo())); + oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); + return oaf; + } + + public static T setEntity(T entity, OafProtos.Oaf oaf) { + //setting Entity fields + final OafProtos.OafEntity e = oaf.getEntity(); + entity.setId(e.getId()); + entity.setOriginalId(e.getOriginalIdList()); + entity.setCollectedfrom(e.getCollectedfromList() + .stream() + .map(ProtoConverter::mapKV) + .collect(Collectors.toList())); + entity.setPid(e.getPidList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDateofcollection(e.getDateofcollection()); + entity.setDateoftransformation(e.getDateoftransformation()); + entity.setExtraInfo(e.getExtraInfoList() + .stream() + .map(ProtoConverter::mapExtraInfo) + .collect(Collectors.toList())); + return entity; + } + + public static T setResult(T entity, OafProtos.Oaf oaf) { + //setting Entity fields + final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + entity.setAuthor(m.getAuthorList() + .stream() + .map(ProtoConverter::mapAuthor) + .collect(Collectors.toList())); + entity.setResulttype(mapQualifier(m.getResulttype())); + entity.setLanguage(mapQualifier(m.getLanguage())); + entity.setCountry(m.getCountryList() + .stream() + .map(ProtoConverter::mapQualifierAsCountry) + .collect(Collectors.toList())); + 
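The setOaf/setEntity/setResult helpers used throughout this class are declared with bounded type parameters and return their argument, so each concrete subtype keeps its static type while being populated layer by layer. A compact sketch of the idiom with a hypothetical three-level hierarchy:

// Hypothetical hierarchy mirroring the Oaf -> entity -> result layering.
class OafLike { long timestamp; }
class EntityLike extends OafLike { String id; }
class ResultLike extends EntityLike { String title; }

public class PopulationChain {

    // Each helper accepts any subtype and returns it unchanged, so the caller
    // keeps the concrete static type through the whole population chain.
    static <T extends OafLike> T setOaf(T o, long ts) { o.timestamp = ts; return o; }
    static <T extends EntityLike> T setEntity(T e, String id) { e.id = id; return e; }
    static <T extends ResultLike> T setResult(T r, String title) { r.title = title; return r; }

    public static void main(String[] args) {
        // The expression stays typed as ResultLike end to end.
        ResultLike r = setResult(setEntity(setOaf(new ResultLike(), 42L), "id-1"), "A title");
        System.out.println(r.id + " / " + r.title + " / " + r.timestamp);
    }
}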
entity.setSubject(m.getSubjectList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setTitle(m.getTitleList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setRelevantdate(m.getRelevantdateList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDescription(m.getDescriptionList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); + entity.setPublisher(mapStringField(m.getPublisher())); + entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); + entity.setSource(m.getSourceList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFulltext(m.getFulltextList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFormat(m.getFormatList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContributor(m.getContributorList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setResourcetype(mapQualifier(m.getResourcetype())); + entity.setCoverage(m.getCoverageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContext(m.getContextList() + .stream() + .map(ProtoConverter::mapContext) + .collect(Collectors.toList())); + + entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); + + return entity; + } + + private static Qualifier getBestAccessRights(List instanceList) { + if (instanceList != null) { + final Optional min = instanceList.stream() + .map(i -> i.getAccessright()).min(new LicenseComparator()); + + final Qualifier rights = min.isPresent() ? 
mapQualifier(min.get()) : new Qualifier(); + + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } + + return rights; + } + return null; + } + + private static Context mapContext(ResultProtos.Result.Context context) { + + final Context entity = new Context(); + entity.setId(context.getId()); + entity.setDataInfo(context.getDataInfoList() + .stream() + .map(ProtoConverter::mapDataInfo) + .collect(Collectors.toList())); + return entity; + } + + + public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + final KeyValue keyValue = new KeyValue(); + keyValue.setKey(kv.getKey()); + keyValue.setValue(kv.getValue()); + keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); + return keyValue; + } + + public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(d.getDeletedbyinference()); + dataInfo.setInferenceprovenance(d.getInferenceprovenance()); + dataInfo.setInferred(d.getInferred()); + dataInfo.setInvisible(d.getInvisible()); + dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); + dataInfo.setTrust(d.getTrust()); + return dataInfo; + } + + public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(q.getClassid()); + qualifier.setClassname(q.getClassname()); + qualifier.setSchemeid(q.getSchemeid()); + qualifier.setSchemename(q.getSchemename()); + return qualifier; + } + + public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { + final Country c = new Country(); + c.setClassid(q.getClassid()); + c.setClassname(q.getClassname()); + c.setSchemeid(q.getSchemeid()); + c.setSchemename(q.getSchemename()); + c.setDataInfo(mapDataInfo(q.getDataInfo())); + return c; + } + + public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(sp.getValue()); + structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); + structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); + return structuredProperty; + } + + public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { + final ExtraInfo entity = new ExtraInfo(); + entity.setName(extraInfo.getName()); + entity.setTypology(extraInfo.getTypology()); + entity.setProvenance(extraInfo.getProvenance()); + entity.setTrust(extraInfo.getTrust()); + entity.setValue(extraInfo.getValue()); + return entity; + } + + public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { + final OAIProvenance entity = new OAIProvenance(); + entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); + return entity; + } + + public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { + final OriginDescription originDescriptionResult = new OriginDescription(); + originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); + originDescriptionResult.setAltered(originDescription.getAltered()); + 
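Besides taking the minimum under LicenseComparator, getBestAccessRights above backfills any blank qualifier field with the UNKNOWN / "not available" / dnet:access_modes defaults. A stand-alone sketch of just that backfilling step, using a simplified qualifier type:

import org.apache.commons.lang3.StringUtils;

// Simplified qualifier carrying only the four fields touched by the defaulting logic.
class SimpleQualifier {
    String classid, classname, schemeid, schemename;
}

public class QualifierDefaults {

    static final String UNKNOWN = "UNKNOWN";
    static final String NOT_AVAILABLE = "not available";
    static final String DNET_ACCESS_MODES = "dnet:access_modes";

    // Mirrors the blank-field handling at the end of getBestAccessRights.
    static SimpleQualifier withDefaults(SimpleQualifier q) {
        if (StringUtils.isBlank(q.classid)) { q.classid = UNKNOWN; }
        if (StringUtils.isBlank(q.classname) || UNKNOWN.equalsIgnoreCase(q.classname)) {
            q.classname = NOT_AVAILABLE;
        }
        if (StringUtils.isBlank(q.schemeid)) { q.schemeid = DNET_ACCESS_MODES; }
        if (StringUtils.isBlank(q.schemename)) { q.schemename = DNET_ACCESS_MODES; }
        return q;
    }

    public static void main(String[] args) {
        SimpleQualifier q = withDefaults(new SimpleQualifier());
        System.out.println(q.classid + " / " + q.classname); // UNKNOWN / not available
    }
}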
originDescriptionResult.setBaseURL(originDescription.getBaseURL()); + originDescriptionResult.setIdentifier(originDescription.getIdentifier()); + originDescriptionResult.setDatestamp(originDescription.getDatestamp()); + originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); + return originDescriptionResult; + } + + public static Field mapStringField(FieldTypeProtos.StringField s) { + final Field stringField = new Field<>(); + stringField.setValue(s.getValue()); + stringField.setDataInfo(mapDataInfo(s.getDataInfo())); + return stringField; + } + + public static Field mapBoolField(FieldTypeProtos.BoolField b) { + final Field booleanField = new Field<>(); + booleanField.setValue(b.getValue()); + booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); + return booleanField; + } + + public static Field mapIntField(FieldTypeProtos.IntField b) { + final Field entity = new Field<>(); + entity.setValue(b.getValue()); + entity.setDataInfo(mapDataInfo(b.getDataInfo())); + return entity; + } + + public static Journal mapJournal(FieldTypeProtos.Journal j) { + final Journal journal = new Journal(); + journal.setConferencedate(j.getConferencedate()); + journal.setConferenceplace(j.getConferenceplace()); + journal.setEdition(j.getEdition()); + journal.setEp(j.getEp()); + journal.setIss(j.getIss()); + journal.setIssnLinking(j.getIssnLinking()); + journal.setIssnOnline(j.getIssnOnline()); + journal.setIssnPrinted(j.getIssnPrinted()); + journal.setName(j.getName()); + journal.setSp(j.getSp()); + journal.setVol(j.getVol()); + journal.setDataInfo(mapDataInfo(j.getDataInfo())); + return journal; + } + + public static Author mapAuthor(FieldTypeProtos.Author author) { + final Author entity = new Author(); + entity.setFullname(author.getFullname()); + entity.setName(author.getName()); + entity.setSurname(author.getSurname()); + entity.setRank(author.getRank()); + entity.setPid(author.getPidList() + .stream() + .map(kv -> { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(kv.getValue()); + final Qualifier q = new Qualifier(); + q.setClassid(kv.getKey()); + q.setClassname(kv.getKey()); + sp.setQualifier(q); + return sp; + }) + .collect(Collectors.toList())); + entity.setAffiliation(author.getAffiliationList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + return entity; + + } + + public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { + final GeoLocation entity = new GeoLocation(); + entity.setPoint(geoLocation.getPoint()); + entity.setBox(geoLocation.getBox()); + entity.setPlace(geoLocation.getPlace()); + return entity; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java new file mode 100644 index 0000000000..19a0cb5c9d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java @@ -0,0 +1,194 @@ +package eu.dnetlib.dhp.migration.actions; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.protobuf.InvalidProtocolBufferException; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.data.proto.OafProtos; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import 
eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SparkSession;
+import scala.Tuple2;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.LinkedList;
+
+public class TransformActions implements Serializable {
+
+    private static final Log log = LogFactory.getLog(TransformActions.class);
+    private static final String SEPARATOR = "/";
+
+    public static void main(String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
+                        "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
+        parser.parseArgument(args);
+
+        new TransformActions().run(parser);
+    }
+
+    private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
+
+        final String isLookupUrl = parser.get("isLookupUrl");
+        log.info("isLookupUrl: " + isLookupUrl);
+
+        final String inputPaths = parser.get("inputPaths");
+
+        if (StringUtils.isBlank(inputPaths)) {
+            throw new RuntimeException("empty inputPaths");
+        }
+        log.info("inputPaths: " + inputPaths);
+
+        final String targetBaseDir = getTargetBaseDir(isLookupUrl);
+
+        try(SparkSession spark = getSparkSession(parser)) {
+            final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
+            final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
+
+            for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
+
+                LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
+
+                final String rawset = pathQ.pollLast();
+                final String actionSetDirectory = pathQ.pollLast();
+
+                final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
+
+                if (fs.exists(targetDirectory)) {
+                    log.info(String.format("found target directory '%s'", targetDirectory));
+                    fs.delete(targetDirectory, true);
+                    log.info(String.format("deleted target directory '%s'", targetDirectory));
+                }
+
+                log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
+
+                sc.sequenceFile(sourcePath, Text.class, Text.class)
+                        .mapToPair(a -> new Tuple2<>(a._1(), eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())))
+                        .mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
+                        .filter(t -> StringUtils.isNotBlank(t._2().toString()))
+                        .saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+            }
+        }
+    }
+
+    private Text transformAction(String atomicActionId, eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
+        final Text out = new Text();
+        final ObjectMapper mapper = new ObjectMapper();
+        if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
+            out.set(mapper.writeValueAsString(doTransform(aa)));
+        } else {
+            if (atomicActionId.contains("dedupSimilarity")) {
+                out.set(mapper.writeValueAsString(getRelationAtomicAction(atomicActionId)));
+            }
+        }
+
+        return out;
+    }
+
+    private AtomicAction<Relation> getRelationAtomicAction(String atomicActionId) {
+        final String[] splitId = atomicActionId.split("@");
+
+        String source = splitId[0];
+        String target = splitId[2];
+
+        String[] relSemantic = splitId[1].split("_");
+
+        Relation rel = new Relation();
+        rel.setSource(source);
+        rel.setTarget(target);
+        rel.setRelType(relSemantic[0]);
+        rel.setSubRelType(relSemantic[1]);
+        rel.setRelClass(relSemantic[2]);
+
+        DataInfo d = new DataInfo();
+        d.setDeletedbyinference(false);
+        d.setInferenceprovenance("deduplication");
+        d.setInferred(true);
+        d.setInvisible(false);
+        Qualifier provenanceaction = new Qualifier();
+
+        provenanceaction.setClassid("deduplication");
+        provenanceaction.setClassname("deduplication");
+        provenanceaction.setSchemeid("dnet:provenanceActions");
+        provenanceaction.setSchemename("dnet:provenanceActions");
+
+        d.setProvenanceaction(provenanceaction);
+
+        rel.setDataInfo(d);
+
+        return new AtomicAction<>(Relation.class, rel);
+    }
+
+    private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException {
+        final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
+        final Oaf oaf = ProtoConverter.convert(proto_oaf);
+        switch (proto_oaf.getKind()) {
+            case entity:
+                switch (proto_oaf.getEntity().getType()) {
+                    case datasource:
+                        return new AtomicAction<>(Datasource.class, (Datasource) oaf);
+                    case organization:
+                        return new AtomicAction<>(Organization.class, (Organization) oaf);
+                    case project:
+                        return new AtomicAction<>(Project.class, (Project) oaf);
+                    case result:
+                        final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid();
+                        switch (resulttypeid) {
+                            case "publication":
+                                return new AtomicAction<>(Publication.class, (Publication) oaf);
+                            case "software":
+                                return new AtomicAction<>(Software.class, (Software) oaf);
+                            case "other":
+                                return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
+                            case "dataset":
+                                return new AtomicAction<>(Dataset.class, (Dataset) oaf);
+                            default:
+                                // can be an update, where the resulttype is not specified
+                                return new AtomicAction<>(Result.class, (Result) oaf);
+                        }
+                    default:
+                        throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType());
+                }
+            case relation:
+                return new AtomicAction<>(Relation.class, (Relation) oaf);
+            default:
+                throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
+        }
+    }
+
+    private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
+        ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
+        String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
+        return isLookUp.getResourceProfileByQuery(XQUERY);
+    }
+
+    private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
+        SparkConf conf = new SparkConf();
+
+        return SparkSession
+                .builder()
+                .appName(TransformActions.class.getSimpleName())
+                .master(parser.get("master"))
+                .config(conf)
+                .enableHiveSupport()
+                .getOrCreate();
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java new file mode 100644 index 0000000000..7db2b17726 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplication.java @@ -0,0 +1,481 @@ +package eu.dnetlib.dhp.migration.step1; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; + +import java.io.Closeable; +import java.io.IOException; +import java.sql.Array; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication; +import eu.dnetlib.dhp.migration.utils.DbClient; +import eu.dnetlib.dhp.schema.oaf.Context; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { + + private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = + qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + private final long lastUpdateTimestamp; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + + final boolean processClaims = parser.get("action") != null && 
parser.get("action").equalsIgnoreCase("claims"); + + try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { + if (processClaims) { + log.info("Processing claims..."); + smdbe.execute("queryClaims.sql", smdbe::processClaims); + } else { + log.info("Processing datasources..."); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + + log.info("Processing projects..."); + smdbe.execute("queryProjects.sql", smdbe::processProject); + + log.info("Processing orgs..."); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + + log.info("Processing relations ds <-> orgs ..."); + smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + + log.info("Processing projects <-> orgs ..."); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + } + log.info("All done."); + } + } + + protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST + super(); + this.dbClient = null; + this.lastUpdateTimestamp = new Date().getTime(); + } + + public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser, + final String dbPassword) throws Exception { + super(hdfsPath); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.lastUpdateTimestamp = new Date().getTime(); + } + + public void execute(final String sqlFile, final Function> producer) throws Exception { + final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile)); + + final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); + + dbClient.processResults(sql, consumer); + } + + public List processDatasource(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Datasource ds = new Datasource(); + + ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + ds.setPid(new ArrayList<>()); + ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); + ds.setDateoftransformation(null); // Value not returned by the SQL query + ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB + ds.setOaiprovenance(null); // Values not present in the DB + ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); + ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); + ds.setOfficialname(field(rs.getString("officialname"), info)); + ds.setEnglishname(field(rs.getString("englishname"), info)); + ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); + ds.setLogourl(field(rs.getString("logourl"), info)); + ds.setContactemail(field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); + ds.setDescription(field(rs.getString("description"), info)); + ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); + 
ds.setOdpolicies(field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); + ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); + ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); + ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); + ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); + ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(field(rs.getString("pidsystems"), info)); + ds.setCertificates(field(rs.getString("certificates"), info)); + ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array + ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal + ds.setDataInfo(info); + ds.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(ds); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProject(final ResultSet rs) { + try { + + final DataInfo info = prepareDataInfo(rs); + + final Project p = new Project(); + + p.setId(createOpenaireId(40, rs.getString("projectid"), true)); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + p.setPid(new ArrayList<>()); + p.setDateofcollection(asString(rs.getDate("dateofcollection"))); + p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + p.setExtraInfo(new ArrayList<>()); // Values not present in the DB + p.setOaiprovenance(null); // Values not present in the DB + p.setWebsiteurl(field(rs.getString("websiteurl"), info)); + p.setCode(field(rs.getString("code"), info)); + p.setAcronym(field(rs.getString("acronym"), info)); + p.setTitle(field(rs.getString("title"), info)); + p.setStartdate(field(asString(rs.getDate("startdate")), info)); + p.setEnddate(field(asString(rs.getDate("enddate")), info)); + p.setCallidentifier(field(rs.getString("callidentifier"), info)); + p.setKeywords(field(rs.getString("keywords"), info)); + p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); + p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); + p.setOptional1(field(rs.getString("optional1"), info)); + p.setOptional2(field(rs.getString("optional2"), info)); + 
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(field(rs.getString("contactfullname"), info)); + p.setContactfax(field(rs.getString("contactfax"), info)); + p.setContactphone(field(rs.getString("contactphone"), info)); + p.setContactemail(field(rs.getString("contactemail"), info)); + p.setSummary(field(rs.getString("summary"), info)); + p.setCurrency(field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + p.setDataInfo(info); + p.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(p); + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processOrganization(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Organization o = new Organization(); + + o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + o.setPid(new ArrayList<>()); + o.setDateofcollection(asString(rs.getDate("dateofcollection"))); + o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + o.setExtraInfo(new ArrayList<>()); // Values not present in the DB + o.setOaiprovenance(null); // Values not present in the DB + o.setLegalshortname(field(rs.getString("legalshortname"), info)); + o.setLegalname(field(rs.getString("legalname"), info)); + o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query + o.setWebsiteurl(field(rs.getString("websiteurl"), info)); + o.setLogourl(field(rs.getString("logourl"), info)); + o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); + o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + o.setDataInfo(info); + o.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(o); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processDatasourceOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("organization"), true); + final String dsId = createOpenaireId(10, rs.getString("datasource"), true); + final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("datasourceOrganization"); + r1.setSubRelType("provision"); + r1.setRelClass("isProvidedBy"); + r1.setSource(dsId); + r1.setTarget(orgId); + 
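processDatasourceOrganization here, like processProjectOrganization and the relation branch of processClaims below, always emits a relation together with its inverse (isProvidedBy/provides, isParticipant/hasParticipant) by swapping source and target. A sketch of that pairing with a simplified relation type:

import java.util.Arrays;
import java.util.List;

// Simplified relation carrying only the fields relevant to the pairing pattern.
class SimpleRelation {
    final String relType, subRelType, relClass, source, target;
    SimpleRelation(String relType, String subRelType, String relClass, String source, String target) {
        this.relType = relType; this.subRelType = subRelType; this.relClass = relClass;
        this.source = source; this.target = target;
    }
    @Override public String toString() { return source + " -[" + relClass + "]-> " + target; }
}

public class RelationPairs {

    // Builds the forward relation plus its inverse with source and target swapped.
    static List<SimpleRelation> pair(String relType, String subRelType,
                                     String forwardClass, String inverseClass,
                                     String source, String target) {
        return Arrays.asList(
                new SimpleRelation(relType, subRelType, forwardClass, source, target),
                new SimpleRelation(relType, subRelType, inverseClass, target, source));
    }

    public static void main(String[] args) {
        pair("datasourceOrganization", "provision", "isProvidedBy", "provides", "ds-1", "org-1")
                .forEach(System.out::println);
    }
}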
r1.setCollectedFrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("datasourceOrganization"); + r2.setSubRelType("provision"); + r2.setRelClass("provides"); + r2.setSource(orgId); + r2.setTarget(dsId); + r2.setCollectedFrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProjectOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); + final String projectId = createOpenaireId(40, rs.getString("project"), true); + final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("projectOrganization"); + r1.setSubRelType("participation"); + r1.setRelClass("isParticipant"); + r1.setSource(projectId); + r1.setTarget(orgId); + r1.setCollectedFrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("projectOrganization"); + r2.setSubRelType("participation"); + r2.setRelClass("hasParticipant"); + r2.setSource(orgId); + r2.setTarget(projectId); + r2.setCollectedFrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processClaims(final ResultSet rs) { + + final DataInfo info = + dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9"); + + try { + + if (rs.getString("source_type").equals("context")) { + final Result r; + + if (rs.getString("target_type").equals("dataset")) { + r = new Dataset(); + } else if (rs.getString("target_type").equals("software")) { + r = new Software(); + } else if (rs.getString("target_type").equals("other")) { + r = new OtherResearchProduct(); + } else { + r = new Publication(); + } + r.setId(createOpenaireId(50, rs.getString("target_id"), false)); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setContext(prepareContext(rs.getString("source_id"), info)); + r.setDataInfo(info); + + return Arrays.asList(r); + } else { + final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); + final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); + + final Relation r1 = new Relation(); + final Relation r2 = new Relation(); + + if (rs.getString("source_type").equals("project")) { + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("produces"); + + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("isProducedBy"); + } else { + r1.setRelType("resultResult"); + r1.setSubRelType("relationship"); + r1.setRelClass("isRelatedTo"); + + r2.setRelType("resultResult"); + r2.setSubRelType("relationship"); + r2.setRelClass("isRelatedTo"); + } + + r1.setSource(sourceId); + r1.setTarget(targetId); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + r2.setSource(targetId); + r2.setTarget(sourceId); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } + 
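Several columns returned by these queries pack structured values into delimited strings; the prepare* helpers just below decode them: qualifiers as four '@@@'-separated parts, structured properties as a value and a qualifier joined by '###'. A stand-alone sketch of that decoding, assuming the same delimiters:

import org.apache.commons.lang3.StringUtils;

// Stand-alone sketch of the '###'/'@@@' decoding performed by the prepare* helpers.
public class DelimitedDecoding {

    // classid@@@classname@@@schemeid@@@schemename -> four parts, else null.
    static String[] decodeQualifier(String s) {
        if (StringUtils.isBlank(s)) { return null; }
        final String[] arr = s.split("@@@");
        return arr.length == 4 ? arr : null;
    }

    // value###classid@@@classname@@@schemeid@@@schemename -> value + qualifier parts.
    static String[] decodeStructuredProperty(String s) {
        if (StringUtils.isBlank(s)) { return null; }
        final String[] parts = s.split("###");
        if (parts.length != 2) { return null; }
        final String[] q = decodeQualifier(parts[1]);
        if (q == null) { return null; }
        return new String[] { parts[0], q[0], q[1], q[2], q[3] };
    }

    public static void main(String[] args) {
        String[] sp = decodeStructuredProperty("keyword###subject@@@subject@@@dnet:subjects@@@dnet:subjects");
        System.out.println(sp[0] + " / " + sp[1]); // keyword / subject
    }
}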
+        } catch (final Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
+        final Context context = new Context();
+        context.setId(id);
+        context.setDataInfo(Arrays.asList(dataInfo));
+        return Arrays.asList(context);
+    }
+
+    private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
+        final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
+        final String inferenceprovenance = rs.getString("inferenceprovenance");
+        final Boolean inferred = rs.getBoolean("inferred");
+        final String trust = rs.getString("trust");
+        return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
+    }
+
+    private Qualifier prepareQualifierSplitting(final String s) {
+        if (StringUtils.isBlank(s)) { return null; }
+        final String[] arr = s.split("@@@");
+        return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
+    }
+
+    private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
+        try {
+            return array != null ? listFields(info, (String[]) array.getArray()) : new ArrayList<>();
+        } catch (final SQLException e) {
+            throw new RuntimeException("Invalid SQL array", e);
+        }
+    }
+
+    private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
+        if (StringUtils.isBlank(s)) { return null; }
+        final String[] parts = s.split("###");
+        if (parts.length == 2) {
+            final String value = parts[0];
+            final String[] arr = parts[1].split("@@@");
+            if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
+        }
+        return null;
+    }
+
+    private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
+        final List<StructuredProperty> res = new ArrayList<>();
+        if (array != null) {
+            for (final String s : (String[]) array.getArray()) {
+                final StructuredProperty sp = prepareStructProp(s, dataInfo);
+                if (sp != null) {
+                    res.add(sp);
+                }
+            }
+        }
+
+        return res;
+    }
+
+    private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
+        if (StringUtils.isNotBlank(sj)) {
+            final String[] arr = sj.split("@@@");
+            if (arr.length == 3) {
+                final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
+                final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;
+                final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;
+                if (issn != null || eissn != null || lissn != null) { return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info); }
+            }
+        }
+        return null;
+    }
+
+    @Override
+    public void close() throws IOException {
+        super.close();
+        dbClient.close();
+    }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java
new file mode 100644
index 0000000000..b1de31326b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step1/MigrateMongoMdstoresApplication.java
@@ -0,0 +1,67 @@
+package eu.dnetlib.dhp.migration.step1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
+import eu.dnetlib.dhp.migration.utils.MdstoreClient;
+
+public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
+
+    private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
+
+    private final MdstoreClient mdstoreClient;
+
+    public static void main(final String[] args) throws Exception {
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+                IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
+        parser.parseArgument(args);
+
+        final String mongoBaseUrl = parser.get("mongoBaseUrl");
+        final String mongoDb = parser.get("mongoDb");
+
+        final String mdFormat = parser.get("mdFormat");
+        final String mdLayout = parser.get("mdLayout");
+        final String mdInterpretation = parser.get("mdInterpretation");
+
+        final String hdfsPath = parser.get("hdfsPath");
+
+        try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
+            app.execute(mdFormat, mdLayout, mdInterpretation);
+        }
+
+    }
+
+    public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
+        super(hdfsPath);
+        this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
+    }
+
+    public void execute(final String format, final String layout, final String interpretation) {
+        final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
+        log.info("Found " + colls.size() + " mdstores");
+
+        for (final Entry<String, String> entry : colls.entrySet()) {
+            log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
+            final String currentColl = entry.getValue();
+
+            for (final String xml : mdstoreClient.listRecords(currentColl)) {
+                emit(xml, "native_" + format);
+            }
+        }
+    }
+
+    @Override
+    public void close() throws IOException {
+        super.close();
+        mdstoreClient.close();
+    }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java
new file mode 100644
index 0000000000..7c3000fbad
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/AbstractMdRecordToOafMapper.java
@@ -0,0 +1,394 @@ +package eu.dnetlib.dhp.migration.step2; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public abstract class AbstractMdRecordToOafMapper { + + protected final Map code2name; + + protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = + qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); + + protected AbstractMdRecordToOafMapper(final Map code2name) { + this.code2name = code2name; + } + + public List processMdRecord(final String xml) { + try { + final Map nsContext = new HashMap<>(); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + + final Document doc = DocumentHelper.parseText(xml); + + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue 
collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom + : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name")); + + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected List createOafs(final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "": + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; + } + + if (!oafs.isEmpty()) { + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); + } + + return oafs; + } + + private List addProjectRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + final String projectId = createOpenaireId(40, ((Node) o).getText(), true); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + 
r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); + r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + + return res; + } + + protected abstract List addOtherResultRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + + private void populateResultFields(final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + r.setSource(prepareSources(doc, info)); + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); + } + + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools(Document 
doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons(Document doc, DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls(Document doc, DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } + } + return null; + } + + protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, schemeId, schemeName); + } + + protected List prepareListStructProps(final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = code2name.get(classId); + res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + protected List prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps(final Node node, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n + .valueOf("@schemename"), info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = 
doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { return null; } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + final String harvestDate = n.valueOf("@harvestDate"); + + return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + + } + + protected DataInfo prepareDataInfo(final Document doc) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { return null; } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); + } + + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + protected List> prepareListFields(final Node node, final String xpath, final DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java new file mode 100644 index 0000000000..7f907b0c8c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/GenerateEntitiesApplication.java @@ -0,0 +1,173 @@ +package eu.dnetlib.dhp.migration.step2; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication; +import
eu.dnetlib.dhp.migration.utils.DbClient; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; +import scala.Tuple2; + +public class GenerateEntitiesApplication { + + private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json"))); + + parser.parseArgument(args); + + final String sourcePaths = parser.get("sourcePaths"); + final String targetPath = parser.get("targetPath"); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + + try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) { + final List existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList()); + generateEntities(sc, code2name, existingSourcePaths, targetPath); + } + } + + private static SparkSession newSparkSession(final ArgumentApplicationParser parser) { + return SparkSession + .builder() + .appName(GenerateEntitiesApplication.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + } + + private static void generateEntities(final JavaSparkContext sc, + final Map code2name, + final List sourcePaths, + final String targetPath) { + + log.info("Generate entities from files:"); + sourcePaths.forEach(log::info); + + JavaRDD inputRdd = sc.emptyRDD(); + + for (final String sp : sourcePaths) { + inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .flatMap(list -> list.iterator()) + .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf))); + } + + inputRdd.saveAsTextFile(targetPath, GzipCodec.class); + + } + + private static List convertToListOaf(final String id, final String s, final Map code2name) { + final String type = StringUtils.substringAfter(id, ":"); + + switch (type.toLowerCase()) { + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproducts": + default: + return Arrays.asList(convertFromJson(s, 
OtherResearchProduct.class)); + } + + } + + private static Map loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + + log.info("Loading vocabulary terms from db..."); + + final Map map = new HashMap<>(); + + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + dbClient.processResults("select code, name from class", rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } + + log.info("Found " + map.size() + " terms."); + + return map; + + } + + private static String convertToJson(final Oaf oaf) { + try { + return new ObjectMapper().writeValueAsString(oaf); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + private static Oaf convertFromJson(final String s, final Class clazz) { + try { + return new ObjectMapper().readValue(s, clazz); + } catch (final Exception e) { + log.error("Error parsing object of class: " + clazz); + log.error(s); + throw new RuntimeException(e); + } + } + + private static boolean exists(final JavaSparkContext context, final String pathToFile) { + try { + final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration()); + final Path path = new Path(pathToFile); + return hdfs.exists(path); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java new file mode 100644 index 0000000000..110abc4864 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OafToOafMapper.java @@ -0,0 +1,242 @@ +package eu.dnetlib.dhp.migration.step2; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.dom4j.Document; +import org.dom4j.Node; + +import eu.dnetlib.dhp.migration.utils.PacePerson; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class OafToOafMapper extends AbstractMdRecordToOafMapper { + + public OafToOafMapper(final Map code2name) { + super(code2name); + } + + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.getText()); + author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } + res.add(author); + } + return res; + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); + } + + @Override + protected List prepareSubjects(final 
Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } + + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description", info); + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributor", info); + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:coverage", info); + } + + @Override + protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:identifier")) { + final String url = ((Node) o).getText().trim(); + if (url.startsWith("http")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(url)); + instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + } + return res; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:source", info); + } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + // SOFTWARES + + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + // DATASETS + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { + return null; // NOT 
PRESENT IN OAF + } + + @Override + protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + + // OTHER PRODUCTS + + @Override + protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List addOtherResultRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); + + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + return res; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java new file mode 100644 index 0000000000..b4868b8f9b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step2/OdfToOafMapper.java @@ -0,0 +1,265 @@ +package eu.dnetlib.dhp.migration.step2; + +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import 
eu.dnetlib.dhp.schema.oaf.GeoLocation; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class OdfToOafMapper extends AbstractMdRecordToOafMapper { + + public OdfToOafMapper(final Map code2name) { + super(code2name); + } + + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); + } + + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//datacite:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./datacite:creatorName")); + author.setName(n.valueOf("./datacite:givenName")); + author.setSurname(n.valueOf("./datacite:familyName")); + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); + author.setRank(pos++); + res.add(author); + } + return res; + } + + private List preparePids(final Node n, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { + res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); + } + return res; + } + + @Override + protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(((Node) o).getText().trim())); + instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + return res; + } + + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ???
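The author loop above resolves affiliation and PID lookups against the current creator node n: with dom4j, a relative path such as ./datacite:nameIdentifier evaluated on the whole Document matches nothing, because '.' then denotes the document node rather than the creator. A self-contained sketch of the difference, using an invented one-creator record:

    import java.util.HashMap;
    import java.util.Map;

    import org.dom4j.Document;
    import org.dom4j.DocumentFactory;
    import org.dom4j.DocumentHelper;
    import org.dom4j.Node;

    public class RelativeXPathDemo {

        public static void main(final String[] args) throws Exception {
            final Map<String, String> ns = new HashMap<>();
            ns.put("datacite", "http://datacite.org/schema/kernel-3");
            DocumentFactory.getInstance().setXPathNamespaceURIs(ns);

            final Document doc = DocumentHelper.parseText(
                "<resource xmlns:datacite='http://datacite.org/schema/kernel-3'>"
                    + "<datacite:creator>"
                    + "<datacite:creatorName>Doe, Jane</datacite:creatorName>"
                    + "<datacite:nameIdentifier nameIdentifierScheme='ORCID'>0000-0002-1825-0097</datacite:nameIdentifier>"
                    + "</datacite:creator>"
                    + "</resource>");

            for (final Object o : doc.selectNodes("//datacite:creator")) {
                final Node creator = (Node) o;
                // evaluated against the creator node: finds the ORCID
                System.out.println(creator.valueOf("./datacite:nameIdentifier"));
                // evaluated against the document node: empty string, wrong context
                System.out.println("[" + doc.valueOf("./datacite:nameIdentifier") + "]");
            }
        }
    }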
+ } + + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//datacite:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) || (!dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available"))) { + res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info)); + } + } + return res; + } + + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } + + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributorName", info); + } + + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:format", info); + } + + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:publisher", info); + } + + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); + } + + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:subject", info); + } + + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); + } + + @Override + protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } + + @Override + protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); + } + + @Override + protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); + } + + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + } + + @Override + protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } + + @Override + protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ???
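prepareRelevantDates above keeps untyped dates plus any dateType that is not already mapped onto a dedicated field (Accepted, Issued, Updated, Available). The corrected predicate can be checked in isolation; the isRelevant helper below is only an illustration, not part of the mapper:

    import org.apache.commons.lang3.StringUtils;

    public class RelevantDateFilterDemo {

        // mirrors the condition used in OdfToOafMapper.prepareRelevantDates
        static boolean isRelevant(final String dateType) {
            return StringUtils.isBlank(dateType)
                || (!dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
                    && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available"));
        }

        public static void main(final String[] args) {
            for (final String t : new String[] { "", "Collected", "Copyrighted", "Issued", "Available" }) {
                System.out.println("'" + t + "' -> " + isRelevant(t)); // true, true, true, false, false
            }
        }
    }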
+ } + + @Override + protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); + } + + // DATASETS + + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//datacite:geoLocation")) { + final GeoLocation loc = new GeoLocation(); + loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); + res.add(loc); + } + return res; + } + + @Override + protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } + + @Override + protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Updated']", info); + } + + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:version", info); + } + + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:size", info); + } + + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } + + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Issued']", info); + } + + @Override + protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); + final String type = ((Node) o).valueOf("@relationType"); + + if (type.equals("IsSupplementTo")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "IsSupplementTo")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "IsSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); + } + } + return res; + } + + private Relation prepareOtherResultRel(final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + return r; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier(doc, "//*[local-name() = 
'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource"); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java new file mode 100644 index 0000000000..4ee24cba0a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/step3/DispatchEntitiesApplication.java @@ -0,0 +1,71 @@ +package eu.dnetlib.dhp.migration.step3; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + +public class DispatchEntitiesApplication { + + private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(MigrateMongoMdstoresApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json"))); + parser.parseArgument(args); + + try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) { + + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); + + processEntity(sc, Publication.class, sourcePath, targetPath); + processEntity(sc, Dataset.class, sourcePath, targetPath); + processEntity(sc, Software.class, sourcePath, targetPath); + processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath); + processEntity(sc, Datasource.class, sourcePath, targetPath); + processEntity(sc, Organization.class, sourcePath, targetPath); + processEntity(sc, Project.class, sourcePath, targetPath); + processEntity(sc, Relation.class, sourcePath, targetPath); + } + } + + private static SparkSession newSparkSession(final ArgumentApplicationParser parser) { + return SparkSession + .builder() + .appName(DispatchEntitiesApplication.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + } + + private static void processEntity(final JavaSparkContext sc, final Class clazz, final String sourcePath, final String targetPath) { + final String type = clazz.getSimpleName().toLowerCase(); + + log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath)); + + sc.textFile(sourcePath) + .filter(l -> isEntityType(l, type)) + .map(l -> StringUtils.substringAfter(l, "|")) + .saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ??? 
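processEntity above depends on the "type|json" line shape produced by GenerateEntitiesApplication. Only the first '|' acts as separator: StringUtils.substringBefore and substringAfter split on the first occurrence, so pipes inside the JSON payload (OpenAIRE identifiers of the form "50|prefix::hash" contain one) survive intact. A small sketch with an invented line:

    import org.apache.commons.lang3.StringUtils;

    public class DispatchLineDemo {

        public static void main(final String[] args) {
            // invented example of one line written by GenerateEntitiesApplication
            final String line = "publication|{\"id\":\"50|someprefix::abc123\"}";

            System.out.println(StringUtils.substringBefore(line, "|")); // publication
            System.out.println(StringUtils.substringAfter(line, "|"));  // {"id":"50|someprefix::abc123"}
        }
    }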
+ } + + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java new file mode 100644 index 0000000000..e1a5e5fa79 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/AbstractMigrationApplication.java @@ -0,0 +1,81 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.codehaus.jackson.map.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class AbstractMigrationApplication implements Closeable { + + private final AtomicInteger counter = new AtomicInteger(0); + + private final Text key = new Text(); + + private final Text value = new Text(); + + private final SequenceFile.Writer writer; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); + + protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST + this.writer = null; + } + + public AbstractMigrationApplication(final String hdfsPath) throws Exception { + + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); + + this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer + .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); + } + + private Configuration getConf() throws IOException { + final Configuration conf = new Configuration(); + /* + * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + * conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); + * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); + */ + return conf; + } + + protected void emit(final String s, final String type) { + try { + key.set(counter.getAndIncrement() + ":" + type); + value.set(s); + writer.append(key, value); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected void emitOaf(final Oaf oaf) { + try { + emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public ObjectMapper getObjectMapper() { + return objectMapper; + } + + @Override + public void close() throws IOException { + writer.hflush(); + writer.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java new file mode 100644 index 0000000000..8e97843464 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/DbClient.java @@ -0,0 +1,65 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.io.Closeable; +import java.io.IOException; 
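To make the life cycle of the AbstractMigrationApplication base class above concrete: construct with an output path, emit records (the counter prefix keeps every key unique), and close to flush the SequenceFile. A hypothetical minimal subclass; the class name, output path and payloads are invented for the sketch:

    import java.util.Arrays;
    import java.util.List;

    import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;

    public class ListMigrationApplication extends AbstractMigrationApplication {

        public ListMigrationApplication(final String hdfsPath) throws Exception {
            super(hdfsPath);
        }

        public void execute(final List<String> records) {
            for (final String xml : records) {
                emit(xml, "native_oaf"); // keys: "0:native_oaf", "1:native_oaf", ...
            }
        }

        public static void main(final String[] args) throws Exception {
            try (ListMigrationApplication app = new ListMigrationApplication("file:///tmp/demo.seq")) {
                app.execute(Arrays.asList("<record>1</record>", "<record>2</record>"));
            }
        }
    }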
+import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.function.Consumer; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class DbClient implements Closeable { + + private static final Log log = LogFactory.getLog(DbClient.class); + + private Connection connection; + + public DbClient(final String address, final String login, final String password) { + + try { + Class.forName("org.postgresql.Driver"); + + this.connection = + StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error("Connection to postgresDB failed"); + throw new RuntimeException("Connection to postgresDB failed", e); + } + log.info("Opened database successfully"); + } + + public void processResults(final String sql, final Consumer consumer) { + + try (final Statement stmt = connection.createStatement()) { + stmt.setFetchSize(100); + + try (final ResultSet rs = stmt.executeQuery(sql)) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + log.error("Error executing sql query: " + sql, e); + throw new RuntimeException("Error executing sql query", e); + } + } catch (final SQLException e1) { + log.error("Error preparing sql statement", e1); + throw new RuntimeException("Error preparing sql statement", e1); + } + } + + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java new file mode 100644 index 0000000000..612503da74 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/MdstoreClient.java @@ -0,0 +1,94 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.bson.Document; + +import com.google.common.collect.Iterables; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +public class MdstoreClient implements Closeable { + + private final MongoClient client; + private final MongoDatabase db; + + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; + + private static final Log log = LogFactory.getLog(MdstoreClient.class); + + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } + + public Map validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) { + + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = 
entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } + + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { + if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } + + return res; + } + + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } + + private MongoCollection getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String.format("Missing collection '%s' in database '%s'", collName, db.getName()); + log.warn(err); + if (abortIfMissing) { + throw new RuntimeException(err); + } else { + return null; + } + } + return db.getCollection(collName); + } + + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null ? new ArrayList<>() + : () -> StreamSupport.stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } + + @Override + public void close() throws IOException { + client.close(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java new file mode 100644 index 0000000000..8e51c1858e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/OafMapperUtils.java @@ -0,0 +1,195 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.ExtraInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Journal; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OAIProvenance; +import eu.dnetlib.dhp.schema.oaf.OriginDescription; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class OafMapperUtils { + + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String...
s) { + if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { return null; } + + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... values) { + return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); + } + + public static List> listFields(final DataInfo info, final List values) { + return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); + } + + public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static StructuredProperty structuredProperty(final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) { + if (value == null) { return null; } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance(final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal(final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + + if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + 
j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } + + public static DataInfo dataInfo(final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) { + if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } + + public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } + + public static String asString(final Object o) { + return o == null ? "" : o.toString(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java new file mode 100644 index 0000000000..69e128e63c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/utils/PacePerson.java @@ -0,0 +1,176 @@ +package eu.dnetlib.dhp.migration.utils; + +import java.nio.charset.Charset; + import java.text.Normalizer; + import java.util.HashSet; + import java.util.List; + import java.util.Set; + + import org.apache.commons.io.IOUtils; + import org.apache.commons.lang3.text.WordUtils; + + import com.google.common.base.Joiner; + import com.google.common.base.Splitter; + import com.google.common.collect.Iterables; + import com.google.common.collect.Lists; + import com.google.common.hash.Hashing; + +public class PacePerson { + + private static final String UTF8 = "UTF-8"; + private List<String> name = Lists.newArrayList(); + private List<String> surname = Lists.newArrayList(); + private List<String> fullname = Lists.newArrayList(); + private final String original; + + private static Set<String> particles = null; + + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } + + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." 
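+ // e.g. a single-letter initial such as "M" becomes "M." here, while longer tokens fall through unchanged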
: s; + } + + public static Set<String> loadFromClasspath(final String classpath) { + final Set<String> h = new HashSet<>(); + try { + for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } + + public PacePerson(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } + + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } + + private List<String> splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt"); + } + + final List<String> list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } + + public List<String> getName() { + return name; + } + + public String getNameString() { + return Joiner.on(" ").join(getName()); + } + + public List<String> getSurname() { + return surname; + } + + public List<String> getFullname() { + return fullname; + } + + public String getOriginal() { + return original; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString(); + } + + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } + + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } + + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } + + public String getNormalisedFullname() { + return isAccurate() ? 
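+ // e.g. an accurate PacePerson("ARTINI, Michele", false) normalises to "Artini, Michele"; when name and surname could not be split, the raw fullname tokens are joined below instead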
getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); + } + + public List<String> getCapitalFirstnames() { + return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } + + public List<String> getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } + + public List<String> getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } + + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json new file mode 100644 index 0000000000..8c81290ca2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source path", + "paramRequired": true + }, + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "g", + "paramLongName": "graphRawPath", + "paramDescription": "the path of the raw graph in HDFS", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json new file mode 100644 index 0000000000..53ee010c45 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/generate_entities_parameters.json @@ -0,0 +1,39 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePaths", + "paramDescription": "the HDFS source paths which contain the sequential files (comma separated)", + "paramRequired": true + }, + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the target file", + "paramRequired": true + }, + { + "paramName": "pgurl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "pguser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": false + }, + { + "paramName": "pgpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json new file mode 100644 index 0000000000..c4910ec61b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json @@ -0,0 +1,10 @@ +[ + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": 
true}, + {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "nameNode of the target cluster", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true}, + {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true}, + {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true}, + {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true}, + {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate transform-only mode: only apply the transformation step", "paramRequired": true} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json new file mode 100644 index 0000000000..cb13ff0242 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where the sequential file is stored", + "paramRequired": true + }, + { + "paramName": "pgurl", + "paramLongName": "postgresUrl", + "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", + "paramRequired": true + }, + { + "paramName": "pguser", + "paramLongName": "postgresUser", + "paramDescription": "postgres user", + "paramRequired": false + }, + { + "paramName": "pgpasswd", + "paramLongName": "postgresPassword", + "paramDescription": "postgres password", + "paramRequired": false + }, + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "process claims", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json new file mode 100644 index 0000000000..ee1a6ac4ee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json @@ -0,0 +1,38 @@ +[ + { + "paramName": "p", + "paramLongName": "hdfsPath", + "paramDescription": "the path where the sequential file is stored", + "paramRequired": true + }, + { + "paramName": "mongourl", + "paramLongName": "mongoBaseUrl", + "paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]", + "paramRequired": true + }, + { + "paramName": "mongodb", + "paramLongName": "mongoDb", + "paramDescription": "mongo database", + "paramRequired": true + }, + { + "paramName": "f", + "paramLongName": "mdFormat", + "paramDescription": "metadata format", + "paramRequired": true + }, + { + "paramName": "l", + "paramLongName": "mdLayout", + "paramDescription": "metadata layout", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "mdInterpretation", + "paramDescription": "metadata interpretation", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt new file mode 100644 index 0000000000..dae37c9dc3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt @@ -0,0 +1,7 @@ +van +der +de +dell +sig +mr +mrs diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql new file mode 100644 index 0000000000..0390c11aa3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryClaims.sql @@ -0,0 +1 @@ +SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql new file mode 100644 index 0000000000..745f83971a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasourceOrganization.sql @@ -0,0 +1,17 @@ +SELECT + dor.datasource AS datasource, + dor.organization AS organization, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + 'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics, + d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction + +FROM dsm_datasource_organization dor + LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id) + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql new file mode 100644 index 0000000000..8c587f34ee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryDatasources.sql @@ -0,0 +1,147 @@ +SELECT + d.id AS datasourceid, + d.id || array_agg(distinct di.pid) AS identities, + d.officialname AS officialname, + d.englishname AS englishname, + d.contactemail AS contactemail, + CASE + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1']) + THEN + 'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0']) + THEN + 'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver']) + THEN + 'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0']) + THEN + 'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN 
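+ /* each THEN branch below packs classid@@@classname@@@schemeid@@@schemename into a single string; the '@@@' separator is presumably split downstream to rebuild the vocabulary qualifier */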
(array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0']) + THEN + 'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data']) + THEN + 'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native']) + THEN + 'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy']) + THEN + 'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible']) + THEN + 'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + ELSE + 'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel' + END AS openairecompatibility, + d.websiteurl AS websiteurl, + d.logourl AS logourl, + array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage, + d.latitude AS latitude, + d.longitude AS longitude, + d.namespaceprefix AS namespaceprefix, + NULL AS odnumberofitems, + NULL AS odnumberofitemsdate, + + (SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies') + FROM UNNEST( + ARRAY( + SELECT trim(s) + FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects, + + d.description AS description, + NULL AS odpolicies, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages, + ARRAY(SELECT trim(s) + FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes, + false AS inferred, + false AS deletedbyinference, + 0.9 AS trust, + NULL AS inferenceprovenance, + d.dateofcollection AS dateofcollection, + d.dateofvalidation AS dateofvalidation, + -- re3data fields + d.releasestartdate AS releasestartdate, + d.releaseenddate AS releaseenddate, + d.missionstatementurl AS missionstatementurl, + d.dataprovider AS dataprovider, + d.serviceprovider AS serviceprovider, + d.databaseaccesstype AS databaseaccesstype, + d.datauploadtype AS datauploadtype, + d.databaseaccessrestriction AS databaseaccessrestriction, + d.datauploadrestriction AS datauploadrestriction, + d.versioning AS versioning, + d.citationguidelineurl AS citationguidelineurl, + d.qualitymanagementkind AS qualitymanagementkind, + d.pidsystems AS pidsystems, + d.certificates AS certificates, + ARRAY[]::text[] AS policies, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + d.typology || '@@@' || CASE + WHEN (d.typology = 'crissystem') THEN 'CRIS System' + WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository' + WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator' + WHEN (d.typology = 'infospace') THEN 'Information Space' + WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository' + WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional 
Repository Aggregator' + WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal' + WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher' + WHEN (d.typology = 'pubsrepository::mock') THEN 'Other' + WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue' + WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository' + WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator' + WHEN (d.typology = 'entityregistry') THEN 'Registry' + WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure' + WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository' + WHEN (d.typology = 'websource') THEN 'Web Source' + WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database' + WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories' + WHEN (d.typology = 'softwarerepository') THEN 'Software Repository' + WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator' + WHEN (d.typology = 'orprepository') THEN 'Repository' + ELSE 'Other' + END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal + +FROM dsm_datasources d + +LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id) +LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource) +LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource) + +GROUP BY + d.id, + d.officialname, + d.englishname, + d.websiteurl, + d.logourl, + d.contactemail, + d.namespaceprefix, + d.description, + d.latitude, + d.longitude, + d.dateofcollection, + d.dateofvalidation, + d.releasestartdate, + d.releaseenddate, + d.missionstatementurl, + d.dataprovider, + d.serviceprovider, + d.databaseaccesstype, + d.datauploadtype, + d.databaseaccessrestriction, + d.datauploadrestriction, + d.versioning, + d.citationguidelineurl, + d.qualitymanagementkind, + d.pidsystems, + d.certificates, + dc.id, + dc.officialname, + d.issn, + d.eissn, + d.lissn diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql new file mode 100644 index 0000000000..aeb04aff91 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizations.sql @@ -0,0 +1,35 @@ +SELECT + o.id AS organizationid, + o.legalshortname AS legalshortname, + o.legalname AS legalname, + o.websiteurl AS websiteurl, + o.logourl AS logourl, + o.ec_legalbody AS eclegalbody, + o.ec_legalperson AS eclegalperson, + o.ec_nonprofit AS ecnonprofit, + o.ec_researchorganization AS ecresearchorganization, + o.ec_highereducation AS echighereducation, + o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests, + o.ec_internationalorganization AS ecinternationalorganization, + o.ec_enterprise AS ecenterprise, + o.ec_smevalidated AS ecsmevalidated, + o.ec_nutscode AS ecnutscode, + o.dateofcollection AS dateofcollection, + o.lastupdate AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + o.trust AS trust, + '' AS inferenceprovenance, + d.id AS collectedfromid, + d.officialname AS collectedfromname, + o.country || '@@@' || o.country || 
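+ /* the country column uses the same classid@@@classname@@@schemeid@@@schemename encoding as queryDatasources.sql, here against the dnet:countries vocabulary */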
'@@@dnet:countries@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + + ARRAY[]::text[] AS pid +FROM dsm_organizations o + LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom) + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql new file mode 100644 index 0000000000..99c8e04b4b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryOrganizationsFromOpenOrgsDB.sql @@ -0,0 +1,53 @@ +SELECT + o.id AS organizationid, + coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname, + o.name AS legalname, + array_agg(DISTINCT n.name) AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.95 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM organizations o + LEFT OUTER JOIN acronyms a ON (a.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) + LEFT OUTER JOIN other_names n ON (n.id = o.id) +GROUP BY + o.id, + o.name, + o.modification_date, + o.country + +UNION ALL + +SELECT + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid, + n.name AS legalshortname, + n.name AS legalname, + ARRAY[]::text[] AS "alternativeNames", + (array_agg(u.url))[1] AS websiteurl, + o.modification_date AS dateoftransformation, + false AS inferred, + false AS deletedbyinference, + 0.88 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, + array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id) + LEFT OUTER JOIN urls u ON (u.id = o.id) + LEFT OUTER JOIN other_ids i ON (i.id = o.id) +GROUP BY + o.id, o.modification_date, o.country, n.name + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql new file mode 100644 index 0000000000..4c06ca5b9a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjectOrganization.sql @@ -0,0 +1,19 @@ +SELECT + po.project AS project, + po.resporganization AS resporganization, + po.participantnumber AS participantnumber, + po.contribution AS contribution, + NULL AS startdate, + NULL AS enddate, + false AS inferred, + false AS deletedbyinference, + po.trust AS trust, + NULL AS inferenceprovenance, + dc.id AS 
collectedfromid, + dc.officialname AS collectedfromname, + po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics, + 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction + +FROM project_organization po + LEFT OUTER JOIN projects p ON (p.id = po.project) + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql new file mode 100644 index 0000000000..685b57ab65 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects.sql @@ -0,0 +1,89 @@ +SELECT + p.id AS projectid, + p.code AS code, + p.websiteurl AS websiteurl, + p.acronym AS acronym, + p.title AS title, + p.startdate AS startdate, + p.enddate AS enddate, + p.call_identifier AS callidentifier, + p.keywords AS keywords, + p.duration AS duration, + p.ec_sc39 AS ecsc39, + p.oa_mandate_for_publications AS oamandatepublications, + p.ec_article29_3 AS ecarticle29_3, + p.dateofcollection AS dateofcollection, + p.lastupdate AS dateoftransformation, + p.inferred AS inferred, + p.deletedbyinference AS deletedbyinference, + p.trust AS trust, + p.inferenceprovenance AS inferenceprovenance, + p.optional1 AS optional1, + p.optional2 AS optional2, + p.jsonextrainfo AS jsonextrainfo, + p.contactfullname AS contactfullname, + p.contactfax AS contactfax, + p.contactphone AS contactphone, + p.contactemail AS contactemail, + p.summary AS summary, + p.currency AS currency, + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype, + pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction, + array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, + array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, + array_agg(DISTINCT fp.path) AS fundingtree + + FROM projects p + + LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass) + LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme) + + LEFT OUTER JOIN projectpids pp ON (pp.project = p.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid) + + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) + + LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id) + LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding) + + LEFT OUTER JOIN project_subject ps ON (ps.project = p.id) + LEFT OUTER JOIN subjects s ON (s.id = ps.subject) + + LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) + LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + + GROUP BY + p.id, + p.code, + p.websiteurl, + p.acronym, + p.title, + p.startdate, + p.enddate, + p.call_identifier, + p.keywords, + p.duration, + p.ec_sc39, + p.oa_mandate_for_publications, + p.ec_article29_3, + p.dateofcollection, + p.inferred, + p.deletedbyinference, + p.trust, + p.inferenceprovenance, + p.contactfullname, + p.contactfax, + p.contactphone, + p.contactemail, + p.summary, + p.currency, + p.totalcost, + p.fundedamount, + dc.id, + dc.officialname, + pac.code, pac.name, 
pas.code, pas.name, + p.contracttype , p.contracttypename, p.contracttypescheme; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql new file mode 100644 index 0000000000..6cff188756 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/queryProjects_production.sql @@ -0,0 +1,90 @@ +SELECT + p.id AS projectid, + p.code AS code, + p.websiteurl AS websiteurl, + p.acronym AS acronym, + p.title AS title, + p.startdate AS startdate, + p.enddate AS enddate, + p.call_identifier AS callidentifier, + p.keywords AS keywords, + p.duration AS duration, + p.ec_sc39 AS ecsc39, + p.oa_mandate_for_publications AS oamandatepublications, + p.ec_article29_3 AS ecarticle29_3, + p.dateofcollection AS dateofcollection, + p.lastupdate AS dateoftransformation, + p.inferred AS inferred, + p.deletedbyinference AS deletedbyinference, + p.trust AS trust, + p.inferenceprovenance AS inferenceprovenance, + p.optional1 AS optional1, + p.optional2 AS optional2, + p.jsonextrainfo AS jsonextrainfo, + p.contactfullname AS contactfullname, + p.contactfax AS contactfax, + p.contactphone AS contactphone, + p.contactemail AS contactemail, + p.summary AS summary, + p.currency AS currency, + p.totalcost AS totalcost, + p.fundedamount AS fundedamount, + dc.id AS collectedfromid, + dc.officialname AS collectedfromname, + ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype, + pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction, + array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid, + array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects, + array_agg(DISTINCT fp.path) AS fundingtree + FROM projects p + LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass) + LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme) + + LEFT OUTER JOIN projectpids pp ON (pp.project = p.id) + LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid) + + LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom) + + LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id) + LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding) + + LEFT OUTER JOIN project_subject ps ON (ps.project = p.id) + LEFT OUTER JOIN subjects s ON (s.id = ps.subject) + + LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass) + LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme) + + LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass) + LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme) + + GROUP BY + p.id, + p.code, + p.websiteurl, + p.acronym, + p.title, + p.startdate, + p.enddate, + p.call_identifier, + p.keywords, + p.duration, + p.ec_sc39, + p.oa_mandate_for_publications, + p.ec_article29_3, + p.dateofcollection, + p.inferred, + p.deletedbyinference, + p.trust, + p.inferenceprovenance, + p.contactfullname, + p.contactfax, + p.contactphone, + p.contactemail, + p.summary, + p.currency, + p.totalcost, + p.fundedamount, + dc.id, + dc.officialname, + pac.code, pac.name, pas.code, pas.name, + ctc.code, ctc.name, cts.code, cts.name; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql new file mode 100644 index 0000000000..4407559c61 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/sql/querySimilarityFromOpenOrgsDB.sql @@ -0,0 +1,17 @@ +SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar' + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2 +FROM acronyms a + LEFT OUTER JOIN organizations o ON (a.id = o.id) + +UNION ALL + +SELECT + o.id AS id1, + 'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2 +FROM other_names n + LEFT OUTER JOIN organizations o ON (n.id = o.id) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json new file mode 100644 index 0000000000..ce72f53ca6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, + {"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "the (comma separated) paths of the actionSets to transform", "paramRequired": true} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml new file mode 100644 index 0000000000..9637ebdc62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + sourceNN + webhdfs://namenode2.hadoop.dm.openaire.eu:50071 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 + + + spark2EventLogDir + /user/spark/applicationHistory + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml new file mode 100644 index 0000000000..ed01c8de4c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/actions/oozie_app/workflow.xml @@ -0,0 +1,122 @@ + + + + sourceNN + the source name node + + + isLookupUrl + the isLookup service endpoint + + + workingDirectory + /tmp/actionsets + working directory + + + distcp_memory_mb + 6144 + memory for distcp copying actionsets from remote cluster + + + distcp_task_timeout + 60000000 + timeout for distcp copying actions from remote cluster + + + distcp_num_maps + 1 + maximum number of map tasks used in the distcp process + + + transform_only + activate transform-only mode. 
Only apply transformation step + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + + eu.dnetlib.dhp.migration.actions.MigrateActionSet + -Dmapred.task.timeout=${distcp_task_timeout} + -is${isLookupUrl} + -sn${sourceNN} + -tn${nameNode} + -w${workingDirectory} + -nm${distcp_num_maps} + -mm${distcp_memory_mb} + -tt${distcp_task_timeout} + -tr${transform_only} + + + + + + + + + yarn + cluster + transform_actions + eu.dnetlib.dhp.migration.actions.TransformActions + dhp-aggregation-${projectVersion}.jar + + --executor-cores ${sparkExecutorCores} + --executor-memory ${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -mtyarn + -is${isLookupUrl} + --inputPaths${wf:actionData('migrate_actionsets')['target_paths']} + + + + + + + migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml new file mode 100644 index 0000000000..1ac456976d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/claims/oozie_app/workflow.xml @@ -0,0 +1,169 @@ + + + + migrationClaimsPathStep1 + the base path to store hdfs file + + + migrationClaimsPathStep2 + the temporary path to store entities before dispatching + + + migrationClaimsPathStep3 + the graph Raw base path + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationClaimsPathStep1}/db_claims + 
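+ <!-- note: these short flags match the paramName entries declared in migrate_db_entities_parameters.json above, i.e. -p = hdfsPath, -pgurl = postgresUrl, -pguser = postgresUser, -pgpasswd = postgresPassword, -a = action -->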
-pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + -aclaims + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationClaimsPathStep1}/odf_claims + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -iclaim + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationClaimsPathStep1}/oaf_claims + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -iclaim + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateClaimEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims + -t${migrationClaimsPathStep2}/claim_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateClaimGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationClaimsPathStep2}/claim_entities + -g${migrationClaimsPathStep3} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml new file mode 100644 index 0000000000..42ab59822d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_all_steps/oozie_app/workflow.xml @@ -0,0 +1,197 @@ + + + + + workingPath + /tmp/dhp_migration + the base path to store temporary intermediate data + + + graphBasePath + the target path to store raw graph + + + reuseContent + false + should import content from the aggregator or reuse a previous version + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the 
password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${wf:conf('reuseContent') eq false} + ${wf:conf('reuseContent') eq true} + + + + + + + + + + + + + + + + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${workingPath}/db_records + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${workingPath}/odf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -icleaned + + + + + + + + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${workingPath}/oaf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -icleaned + + + + + + + + + + + + + + + + yarn + cluster + GenerateEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mt yarn-cluster + -s${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records + -t${workingPath}/all_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + + + + + + + + + + yarn + cluster + GenerateGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mt yarn-cluster + -s${workingPath}/all_entities + -g${graphBasePath}/graph_raw + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml new file mode 100644 index 0000000000..f16e22f957 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1/oozie_app/workflow.xml @@ -0,0 +1,103 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + mongoURL + mongoDB url, example: mongodb://[username:password@]host[:port] + + + mongoDb + mongo database + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationPathStep1}/db_records + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/odf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fODF + -lstore + -icleaned + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication + -p${migrationPathStep1}/oaf_records + -mongourl${mongoURL} + -mongodb${mongoDb} + -fOAF + -lstore + -icleaned + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml new file mode 100644 index 0000000000..0730f3a1f1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step1_onlydb/oozie_app/workflow.xml @@ -0,0 +1,62 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication + -p${migrationPathStep1}/db_records + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml new file mode 100644 index 0000000000..cd0a4025e8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step2/oozie_app/workflow.xml @@ -0,0 +1,74 @@ + + + + migrationPathStep1 + the base path to store hdfs file + + + migrationPathStep2 + the temporary path to store entities before dispatching + + + postgresURL + the postgres URL to access to the database + + + postgresUser + the user postgres + + + postgresPassword + the password postgres + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateEntities + eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records + -t${migrationPathStep2}/all_entities + -pgurl${postgresURL} + -pguser${postgresUser} + -pgpasswd${postgresPassword} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml new file mode 100644 index 0000000000..8688f09d18 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/wfs/regular_step3/oozie_app/workflow.xml @@ -0,0 +1,60 @@ + + + + + migrationPathStep2 + the temporary path to store entities before dispatching + + + migrationPathStep3 + the 
graph Raw base path + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + ${jobTracker} + ${nameNode} + yarn-cluster + cluster + GenerateGraph + eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication + dhp-aggregation-${projectVersion}.jar + --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse" + -mt yarn-cluster + -s${migrationPathStep2}/all_entities + -g${migrationPathStep3} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties new file mode 100644 index 0000000000..63cba917ee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties @@ -0,0 +1,9 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index 848fbe17da..fde928a8b6 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -1,79 +1,89 @@ package eu.dnetlib.dhp.collection; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.junit.*; - import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.AfterEach; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; + +import static org.junit.jupiter.api.Assertions.*; + public class CollectionJobTest { - private Path testDir; - @Before - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } + private Path testDir; - @After - public void teadDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } + @BeforeEach + public void setup() throws IOException { + testDir = Files.createTempDirectory("dhp-collection"); + } - @Test - public void tesCollection() throws Exception { - Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - GenerateNativeStoreSparkJob.main(new String[] { - "-mt", "local", - "-w", "wid", - "-e", "XML", - "-d", ""+System.currentTimeMillis(), - "-p", new 
ObjectMapper().writeValueAsString(provenance), - "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", testDir.toString()+"/store", - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", ""}); - System.out.println(new ObjectMapper().writeValueAsString(provenance)); - } + @AfterEach + public void teadDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + } + @Test + public void tesCollection() throws Exception { + final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); + GenerateNativeStoreSparkJob.main(new String[] { + "-mt", "local", + "-w", "wid", + "-e", "XML", + "-d", "" + System.currentTimeMillis(), + "-p", new ObjectMapper().writeValueAsString(provenance), + "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), + "-o", testDir.toString() + "/store", + "-t", "true", + "-ru", "", + "-rp", "", + "-rh", "", + "-ro", "", + "-rr", "" }); + System.out.println(new ObjectMapper().writeValueAsString(provenance)); + } + @Test + public void testGenerationMetadataRecord() throws Exception { - @Test - public void testGenerationMetadataRecord() throws Exception { + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); - MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); + assert record != null; + System.out.println(record.getId()); + System.out.println(record.getOriginalId()); - assert record != null; - System.out.println(record.getId()); - System.out.println(record.getOriginalId()); + } + @Test + public void TestEquals() throws IOException { - } + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); + final MetadataRecord record1 = GenerateNativeStoreSparkJob + .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", + "ns_prefix"), System.currentTimeMillis(), null, null); + assert record != null; + record.setBody("ciao"); + assert record1 != null; + record1.setBody("mondo"); + assertEquals(record, record1); - - @Test - public void TestEquals () throws IOException { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); - MetadataRecord record1 = 
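CollectionJobTest still manages its temporary directory by hand in @BeforeEach/@AfterEach; the other tests migrated in this diff lean on JUnit 5's @TempDir instead. A minimal sketch of the declarative variant, assuming JUnit 5.4+:

```java
import java.io.File;
import java.nio.file.Path;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import static org.junit.jupiter.api.Assertions.assertTrue;

public class TempDirSketchTest {

    @Test
    public void writesIntoManagedTempDir(@TempDir Path testDir) {
        // JUnit creates the directory before the test and removes it afterwards,
        // replacing the explicit @BeforeEach/@AfterEach pair used above.
        File store = testDir.resolve("store").toFile();
        assertTrue(store.mkdirs());
    }
}
```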
GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null); - assert record != null; - record.setBody("ciao"); - assert record1 != null; - record1.setBody("mondo"); - Assert.assertEquals(record, record1); - - } + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index 6a9417097f..665e989d83 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -7,13 +7,13 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.File; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.*; @@ -24,7 +24,7 @@ public class DnetCollectorWorkerApplicationTests { private MessageManager messageManager = mock(MessageManager.class); private DnetCollectorWorker worker; - @Before + @BeforeEach public void setup() throws Exception { ObjectMapper mapper = new ObjectMapper(); final String apiJson = mapper.writeValueAsString(getApi()); @@ -47,7 +47,7 @@ public class DnetCollectorWorkerApplicationTests { } - @After + @AfterEach public void dropDown(){ File f = new File("/tmp/file.seq"); f.delete(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java new file mode 100644 index 0000000000..d63bb3ee32 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/migration/step1/MigrateDbEntitiesApplicationTest.java @@ -0,0 +1,293 @@ +package eu.dnetlib.dhp.migration.step1; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +import java.io.IOException; +import java.sql.Array; +import java.sql.Date; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.Objects; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +@ExtendWith(MockitoExtension.class) +public class MigrateDbEntitiesApplicationTest { + + private MigrateDbEntitiesApplication app; + + @Mock + private ResultSet rs; + + @BeforeEach + public void setUp() { + this.app = new MigrateDbEntitiesApplication(); + } + + @Test + public void testProcessDatasource() throws Exception { + final List fields = 
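The new MigrateDbEntitiesApplicationTest drives the processXXX methods entirely off a mocked java.sql.ResultSet. The underlying technique, reduced to a self-contained sketch; the column names below are illustrative:

```java
import java.sql.ResultSet;

import org.mockito.Mockito;

public class ResultSetMockSketch {

    public static void main(String[] args) throws Exception {
        ResultSet rs = Mockito.mock(ResultSet.class);
        Mockito.when(rs.getString("officialname")).thenReturn("Jurnal Ilmiah Pendidikan Scholastic");
        Mockito.when(rs.getBoolean("deletedbyinference")).thenReturn(false);

        // Code under test reads columns by name without touching a database...
        System.out.println(rs.getString("officialname"));

        // ...and the interactions can be verified afterwards, as verifyMocks does.
        Mockito.verify(rs, Mockito.atLeastOnce()).getString("officialname");
    }
}
```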
prepareMocks("datasources_resultset_entry.json"); + + final List list = app.processDatasource(rs); + assertEquals(1, list.size()); + verifyMocks(fields); + + final Datasource ds = (Datasource) list.get(0); + assertValidId(ds.getId()); + assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); + assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); + assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); + assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); + assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } + + @Test + public void testProcessProject() throws Exception { + final List fields = prepareMocks("projects_resultset_entry.json"); + + final List list = app.processProject(rs); + assertEquals(1, list.size()); + verifyMocks(fields); + + final Project p = (Project) list.get(0); + assertValidId(p.getId()); + assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); + assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); + assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } + + @Test + public void testProcessOrganization() throws Exception { + final List fields = prepareMocks("organizations_resultset_entry.json"); + + final List list = app.processOrganization(rs); + + assertEquals(1, list.size()); + + verifyMocks(fields); + + final Organization o = (Organization) list.get(0); + assertValidId(o.getId()); + assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); + assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); + assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); + assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); + assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); + assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]); + assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } + + @Test + public void testProcessDatasourceOrganization() throws Exception { + final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); + + final List list = app.processDatasourceOrganization(rs); + + assertEquals(2, list.size()); + verifyMocks(fields); + + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } + + @Test + public void testProcessProjectOrganization() throws Exception { + final List fields = prepareMocks("projectorganization_resultset_entry.json"); + + final List list = app.processProjectOrganization(rs); + + assertEquals(2, 
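The organization assertions above unpack a single string of the form classid@@@classname@@@schemeid@@@schemename. A sketch of that packing convention; the Qualifier class below is a stand-in for eu.dnetlib.dhp.schema.oaf.Qualifier:

```java
public class QualifierParseSketch {

    static final class Qualifier {
        String classid, classname, schemeid, schemename;
    }

    static Qualifier parse(String packed) {
        final String[] parts = packed.split("@@@");
        final Qualifier q = new Qualifier();
        q.classid = parts[0];
        q.classname = parts[1];
        q.schemeid = parts[2];
        q.schemename = parts[3];
        return q;
    }

    public static void main(String[] args) {
        Qualifier country = parse("US@@@US@@@dnet:countries@@@dnet:countries");
        System.out.println(country.classid + " / " + country.schemeid); // US / dnet:countries
    }
}
```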
list.size()); + verifyMocks(fields); + + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } + + @Test + public void testProcessClaims_context() throws Exception { + final List fields = prepareMocks("claimscontext_resultset_entry.json"); + + final List list = app.processClaims(rs); + + assertEquals(1, list.size()); + verifyMocks(fields); + } + + @Test + public void testProcessClaims_rels() throws Exception { + final List fields = prepareMocks("claimsrel_resultset_entry.json"); + + final List list = app.processClaims(rs); + + assertEquals(2, list.size()); + verifyMocks(fields); + } + + private List prepareMocks(final String jsonFile) throws IOException, SQLException { + final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); + final ObjectMapper mapper = new ObjectMapper(); + final List list = mapper.readValue(json, new TypeReference>() {}); + + for (final TypedField tf : list) { + if (tf.getValue() == null) { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())).thenReturn(null); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())).thenReturn(0); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); + break; + case "array": + Mockito.when(rs.getArray(tf.getField())).thenReturn(null); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(null); + break; + } + } else { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString())); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString())); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString())); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString())); + break; + case "array": + final Array arr = Mockito.mock(Array.class); + final String[] values = ((List) tf.getValue()).stream() + .filter(Objects::nonNull) + .map(o -> o.toString()) + .toArray(String[]::new); + + Mockito.when(arr.getArray()).thenReturn(values); + Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); + break; + } + } + } + + return list; + } + + private void verifyMocks(final List list) throws SQLException { + for (final TypedField tf : list) { + + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); + break; + case "date": + Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); + break; + case "int": + Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); + break; + case "double": + Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); + break; + case "array": + Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); + break; + case "string": + default: + Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); + break; + } + } + } + + private void assertValidId(final String 
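One small flag on prepareMocks: it boxes numerics with new Integer(...) and new Double(...), constructors that are deprecated as of Java 9. A sketch of the same stubbing via the valueOf factories; the field/value pair is illustrative:

```java
import java.sql.ResultSet;
import java.sql.SQLException;

import org.mockito.Mockito;

public class NumericStubSketch {

    static void stubNumeric(ResultSet rs, String field, String value) throws SQLException {
        Mockito.when(rs.getInt(field)).thenReturn(Integer.valueOf(value));
        Mockito.when(rs.getDouble(field)).thenReturn(Double.valueOf(value));
    }

    public static void main(String[] args) throws Exception {
        ResultSet rs = Mockito.mock(ResultSet.class);
        stubNumeric(rs, "duration", "42");
        System.out.println(rs.getInt("duration")); // 42
    }
}
```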
id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } + + private String getValueAsString(final String name, final List fields) { + return fields.stream() + .filter(f -> f.getField().equals(name)) + .map(TypedField::getValue) + .filter(Objects::nonNull) + .map(o -> o.toString()) + .findFirst() + .get(); + } +} + +class TypedField { + + private String field; + private String type; + private Object value; + + public String getField() { + return field; + } + + public void setField(final String field) { + this.field = field; + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + + public Object getValue() { + return value; + } + + public void setValue(final Object value) { + this.value = value; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 5e5e42f1e2..dfa0c37203 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -6,47 +6,32 @@ import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; import eu.dnetlib.dhp.utils.DHPUtils; import net.sf.saxon.s9api.*; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; import org.dom4j.Node; import org.dom4j.io.SAXReader; -import org.junit.*; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; -import org.mockito.junit.MockitoJUnit; -import org.mockito.junit.MockitoRule; +import org.mockito.junit.jupiter.MockitoExtension; import javax.xml.transform.stream.StreamSource; -import java.io.File; -import java.io.IOException; import java.io.StringWriter; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Comparator; import java.util.HashMap; import java.util.Map; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +@ExtendWith(MockitoExtension.class) public class TransformationJobTest { @Mock - LongAccumulator accumulator; - - @Rule - public MockitoRule mockitoRule = MockitoJUnit.rule(); - - private Path testDir; - - @Before - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } - - @After - public void tearDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } - + private LongAccumulator accumulator; @Test public void testTransformSaxonHE() throws Exception { @@ -70,9 +55,9 @@ public class TransformationJobTest { System.out.println(output.toString()); } - + @DisplayName("Test TransformSparkJobNode.main") @Test - public void transformTest() throws Exception { + public void transformTest(@TempDir Path testDir) throws Exception { final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); final String mdstore_output = testDir.toString()+"/version"; final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); @@ -89,8 
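assertValidId above pins down a 49-character identifier layout: a 2-char entity type, '|' at index 2, a 12-char namespace prefix, "::" at indexes 15-16, and a 32-char md5 of the original id. A sketch of an id built that way; the concrete prefix values are illustrative assumptions inferred from the assertions, not taken from the repository:

```java
import org.apache.commons.codec.digest.DigestUtils;

public class OafIdSketch {

    static String createId(String typePrefix, String nsPrefix, String originalId) {
        // 2 + 1 + 12 + 2 + 32 = 49 characters in total
        return String.format("%s|%s::%s", typePrefix, nsPrefix, DigestUtils.md5Hex(originalId));
    }

    public static void main(String[] args) {
        String id = createId("10", "openaire____", "2579-5449");
        System.out.println(id.length() == 49 && id.charAt(2) == '|'); // true
    }
}
```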
+74,6 @@ public class TransformationJobTest { "-rh", "", "-ro", "", "-rr", ""}); - - } @Test @@ -121,7 +104,7 @@ public class TransformationJobTest { record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); final MetadataRecord result = tf.call(record); - Assert.assertNotNull(result.getBody()); + assertNotNull(result.getBody()); System.out.println(result.getBody()); } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java index d96a7ac4c8..c2db17a9d7 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.transformation.vocabulary; -import org.junit.Test; -import static org.junit.Assert.*; +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; public class VocabularyTest { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json new file mode 100644 index 0000000000..72bd01a966 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimscontext_resultset_entry.json @@ -0,0 +1,27 @@ +[ + { + "field": "source_type", + "type": "string", + "value": "context" + }, + { + "field": "source_id", + "type": "string", + "value": "oa-pg" + }, + { + "field": "target_type", + "type": "string", + "value": "publication" + }, + { + "field": "target_id", + "type": "string", + "value": "userclaim___::d99de49026e79d271f3e7451d8de18b6" + }, + { + "field": "semantics", + "type": "not_used", + "value": "isRelevantTo" + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json new file mode 100644 index 0000000000..28fa700356 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/claimsrel_resultset_entry.json @@ -0,0 +1,27 @@ +[ + { + "field": "source_type", + "type": "string", + "value": "project" + }, + { + "field": "source_id", + "type": "string", + "value": "corda__h2020::b38a638a93b505d670fcacc47a0283d6" + }, + { + "field": "target_type", + "type": "string", + "value": "publication" + }, + { + "field": "target_id", + "type": "string", + "value": "userclaim___::5b5117253d3c64c79809d0b92fa287b4" + }, + { + "field": "semantics", + "type": "not_used", + "value": "resultProject_outcome_produces" + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json new file mode 100644 index 0000000000..3a0318ed7b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasourceorganization_resultset_entry.json @@ -0,0 +1,62 @@ +[ + { + "field": "datasource", + "type": "string", + "value": 
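The JSON fixtures that follow are deserialized by prepareMocks into a list of TypedField objects via a Jackson TypeReference; note the generic parameters, which the rendering of this diff has stripped from the original readValue call. A sketch of the load step, with TypedField being the POJO defined alongside the test above:

```java
import java.util.List;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

public class FixtureLoadSketch {

    public static void main(String[] args) throws Exception {
        final String json = "[{\"field\": \"source_type\", \"type\": \"string\", \"value\": \"context\"}]";
        final ObjectMapper mapper = new ObjectMapper();
        final List<TypedField> fields =
                mapper.readValue(json, new TypeReference<List<TypedField>>() {});
        System.out.println(fields.get(0).getField()); // source_type
    }
}
```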
"openaire____::revistasunicauca" + }, + { + "field": "organization", + "type": "string", + "value": "openaire____::openaire____::revistasunicauca" + }, + { + "field": "startdate", + "type": "not_used", + "value": null + }, + { + "field": "enddate", + "type": "not_used", + "value": null + }, + { + "field": "inferred", + "type": "boolean", + "value": false + }, + { + "field": "deletedbyinference", + "type": "boolean", + "value": false + }, + { + "field": "trust", + "type": "string", + "value": "0.9" + }, + { + "field": "inferenceprovenance", + "type": "string", + "value": null + }, + { + "field": "collectedfromid", + "type": "string", + "value": null + }, + { + "field": "collectedfromname", + "type": "string", + "value": null + }, + { + "field": "semantics", + "type": "not_used", + "value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies" + }, + { + "field": "provenanceaction", + "type": "not_used", + "value": null + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json new file mode 100644 index 0000000000..71e84954f6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/datasources_resultset_entry.json @@ -0,0 +1,234 @@ +[ + { + "field": "datasourceid", + "type": "string", + "value": "274269ac6f3b::2579-5449" + }, + { + "field": "identities", + "type": "not_used", + "value": [ + "274269ac6f3b::2579-5449", + null + ] + }, + { + "field": "officialname", + "type": "string", + "value": "Jurnal Ilmiah Pendidikan Scholastic" + }, + { + "field": "englishname", + "type": "string", + "value": "Jurnal Ilmiah Pendidikan Scholastic" + }, + { + "field": "contactemail", + "type": "string", + "value": "test@test.it" + }, + { + "field": "openairecompatibility", + "type": "string", + "value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel" + }, + { + "field": "websiteurl", + "type": "string", + "value": "http://e-journal.sastra-unes.com/index.php/JIPS/index" + }, + { + "field": "logourl", + "type": "string", + "value": null + }, + { + "field": "accessinfopackage", + "type": "array", + "value": [ + null + ] + }, + { + "field": "latitude", + "type": "double", + "value": 0 + }, + { + "field": "longitude", + "type": "double", + "value": 0 + }, + { + "field": "namespaceprefix", + "type": "string", + "value": "ojs_25795449" + }, + { + "field": "odnumberofitems", + "type": "int", + "value": null + }, + { + "field": "odnumberofitemsdate", + "type": "date", + "value": null + }, + { + "field": "subjects", + "type": "array", + "value": null + }, + { + "field": "description", + "type": "string", + "value": null + }, + { + "field": "odpolicies", + "type": "string", + "value": null + }, + { + "field": "odlanguages", + "type": "array", + "value": [] + }, + { + "field": "odcontenttypes", + "type": "array", + "value": [ + "Journal articles" + ] + }, + { + "field": "inferred", + "type": "boolean", + "value": false + }, + { + "field": "deletedbyinference", + "type": "boolean", + "value": false + }, + { + "field": "trust", + "type": "string", + "value": "0.9" + }, + { + "field": "inferenceprovenance", + "type": "string", + "value": null + }, + { + "field": "dateofcollection", + "type": "date", + "value": "2020-01-21" + }, + { 
+ "field": "dateofvalidation", + "type": "date", + "value": null + }, + { + "field": "releasestartdate", + "type": "date", + "value": null + }, + { + "field": "releaseenddate", + "type": "date", + "value": null + }, + { + "field": "missionstatementurl", + "type": "string", + "value": null + }, + { + "field": "dataprovider", + "type": "boolean", + "value": null + }, + { + "field": "serviceprovider", + "type": "boolean", + "value": null + }, + { + "field": "databaseaccesstype", + "type": "string", + "value": null + }, + { + "field": "datauploadtype", + "type": "string", + "value": null + }, + { + "field": "databaseaccessrestriction", + "type": "string", + "value": null + }, + { + "field": "datauploadrestriction", + "type": "string", + "value": null + }, + { + "field": "versioning", + "type": "boolean", + "value": null + }, + { + "field": "citationguidelineurl", + "type": "string", + "value": null + }, + { + "field": "qualitymanagementkind", + "type": "string", + "value": null + }, + { + "field": "pidsystems", + "type": "string", + "value": null + }, + { + "field": "certificates", + "type": "string", + "value": null + }, + { + "field": "policies", + "type": "not_used", + "value": [] + }, + { + "field": "collectedfromid", + "type": "string", + "value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ==" + }, + { + "field": "collectedfromname", + "type": "string", + "value": "Jurnal Fakultas Sastra Universitas Ekasakti" + }, + { + "field": "datasourcetype", + "type": "string", + "value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies" + }, + { + "field": "provenanceaction", + "type": "not_used", + "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions" + }, + { + "field": "journal", + "type": "string", + "value": "2579-5449@@@2597-6540@@@" + } +] diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json new file mode 100644 index 0000000000..f766246bcc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/organizations_resultset_entry.json @@ -0,0 +1,127 @@ +[ + { + "field": "organizationid", + "type": "string", + "value": "openaire____::openaire____::microsoft" + }, + { + "field": "legalshortname", + "type": "string", + "value": "MSFTResearch" + }, + { + "field": "legalname", + "type": "string", + "value": "Microsoft Research" + }, + { + "field": "websiteurl", + "type": "string", + "value": "https://www.microsoft.com/en-us/research/" + }, + { + "field": "logourl", + "type": "string", + "value": null + }, + { + "field": "eclegalbody", + "type": "boolean", + "value": false + }, + { + "field": "eclegalperson", + "type": "boolean", + "value": false + }, + { + "field": "ecnonprofit", + "type": "boolean", + "value": false + }, + { + "field": "ecresearchorganization", + "type": "boolean", + "value": false + }, + { + "field": "echighereducation", + "type": "boolean", + "value": false + }, + { + "field": "ecinternationalorganizationeurinterests", + "type": "boolean", + "value": false + }, + { + "field": "ecinternationalorganization", + "type": "boolean", + "value": false + }, + { + "field": "ecenterprise", + "type": "boolean", + "value": false + }, + { + "field": "ecsmevalidated", + "type": "boolean", + "value": false + }, + { 
+ "field": "ecnutscode", + "type": "boolean", + "value": false + }, + { + "field": "dateofcollection", + "type": "date", + "value": "2018-10-19" + }, + { + "field": "dateoftransformation", + "type": "date", + "value": "2018-10-19" + }, + { + "field": "inferred", + "type": "boolean", + "value": false + }, + { + "field": "deletedbyinference", + "type": "boolean", + "value": false + }, + { + "field": "trust", + "type": "string", + "value": "0.9" + }, + { + "field": "inferenceprovenance", + "type": "string", + "value": "" + }, + { + "field": "collectedfromid", + "type": "string", + "value": "openaire____::TEST" + }, + { + "field": "collectedfromname", + "type": "string", + "value": "TEST" + }, + { + "field": "country", + "type": "string", + "value": "US@@@US@@@dnet:countries@@@dnet:countries" + }, + { + "field": "provenanceaction", + "type": "not_used", + "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions" + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json new file mode 100644 index 0000000000..855e1a4839 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projectorganization_resultset_entry.json @@ -0,0 +1,72 @@ +[ + { + "field": "project", + "type": "string", + "value": "nsf_________::1700003" + }, + { + "field": "resporganization", + "type": "string", + "value": "nsf_________::University_of_Notre_Dame" + }, + { + "field": "participantnumber", + "type": "not_used", + "value": 1 + }, + { + "field": "contribution", + "type": "not_used", + "value": null + }, + { + "field": "startdate", + "type": "not_used", + "value": null + }, + { + "field": "enddate", + "type": "not_used", + "value": null + }, + { + "field": "inferred", + "type": "boolean", + "value": false + }, + { + "field": "deletedbyinference", + "type": "boolean", + "value": false + }, + { + "field": "trust", + "type": "string", + "value": "0.9" + }, + { + "field": "inferenceprovenance", + "type": "string", + "value": null + }, + { + "field": "collectedfromid", + "type": "string", + "value": "openaire____::nsf" + }, + { + "field": "collectedfromname", + "type": "string", + "value": "NSF - National Science Foundation" + }, + { + "field": "semantics", + "type": "not_used", + "value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations" + }, + { + "field": "provenanceaction", + "type": "not_used", + "value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions" + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json new file mode 100644 index 0000000000..7d6ebffbee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/migration/step1/projects_resultset_entry.json @@ -0,0 +1,193 @@ +[ + { + "field": "projectid", + "type": "string", + "value": "aka_________::100469" + }, + { + "field": "code", + "type": "string", + "value": "100469" + }, + { + "field": "websiteurl", + "type": "string", + "value": "http://test" + }, + 
{ + "field": "acronym", + "type": "string", + "value": "RMCAG" + }, + { + "field": "title", + "type": "string", + "value": "Regulation of melanoma cell autonomous growth" + }, + { + "field": "startdate", + "type": "date", + "value": null + }, + { + "field": "enddate", + "type": "date", + "value": null + }, + { + "field": "callidentifier", + "type": "string", + "value": "Tutkijankoulutus ja työskentely ulkomailla/kevät TT" + }, + { + "field": "keywords", + "type": "string", + "value": null + }, + { + "field": "duration", + "type": "int", + "value": null + }, + { + "field": "ecsc39", + "type": "boolean", + "value": null + }, + { + "field": "oamandatepublications", + "type": "boolean", + "value": false + }, + { + "field": "ecarticle29_3", + "type": "boolean", + "value": null + }, + { + "field": "dateofcollection", + "type": "date", + "value": "2019-01-25" + }, + { + "field": "dateoftransformation", + "type": "date", + "value": "2019-04-16" + }, + { + "field": "inferred", + "type": "boolean", + "value": false + }, + { + "field": "deletedbyinference", + "type": "boolean", + "value": false + }, + { + "field": "trust", + "type": "string", + "value": "0.9" + }, + { + "field": "inferenceprovenance", + "type": "string", + "value": null + }, + { + "field": "optional1", + "type": "string", + "value": "9,284 €" + }, + { + "field": "optional2", + "type": "string", + "value": null + }, + { + "field": "jsonextrainfo", + "type": "string", + "value": "{}" + }, + { + "field": "contactfullname", + "type": "string", + "value": null + }, + { + "field": "contactfax", + "type": "string", + "value": null + }, + { + "field": "contactphone", + "type": "string", + "value": null + }, + { + "field": "contactemail", + "type": "string", + "value": null + }, + { + "field": "summary", + "type": "string", + "value": null + }, + { + "field": "currency", + "type": "string", + "value": null + }, + { + "field": "totalcost", + "type": "double", + "value": null + }, + { + "field": "fundedamount", + "type": "double", + "value": null + }, + { + "field": "collectedfromid", + "type": "string", + "value": "openaire____::aka" + }, + { + "field": "collectedfromname", + "type": "string", + "value": "Academy of Finland" + }, + { + "field": "contracttype", + "type": "string", + "value": null + }, + { + "field": "provenanceaction", + "type": "not_used", + "value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions" + }, + { + "field": "pid", + "type": "not_used", + "value": [ + null + ] + }, + { + "field": "subjects", + "type": "array", + "value": [ + null + ] + }, + { + "field": "fundingtree", + "type": "array", + "value": [ + "\n aka_________::AKA\n AKA\n Academy of Finland\n Academy of Finland\n FI\n " + ] + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml new file mode 100644 index 0000000000..a4793da897 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -0,0 +1,97 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.6-SNAPSHOT + + 4.0.0 + dhp-dedup-openaire + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + com.arakelian + 
java-jq + + + dom4j + dom4j + + + jaxen + jaxen + + + + + eu.dnetlib + dnet-pace-core + + + org.apache.spark + spark-graphx_2.11 + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-core + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java similarity index 99% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java index 73f178edce..bd5c1118e9 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Field; import org.apache.commons.lang.StringUtils; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java similarity index 93% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java index ebb5040781..583e90ab94 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java @@ -1,6 +1,5 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; @@ -11,7 +10,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; +import org.codehaus.jackson.map.ObjectMapper; import scala.Tuple2; import java.util.Collection; @@ -69,8 +68,6 @@ public class DedupRecordFactory { p.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); @@ -103,7 +100,6 @@ public class DedupRecordFactory { d.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final Collection dateofacceptance = Lists.newArrayList(); @@ -136,7 +132,6 @@ public class DedupRecordFactory { p.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); if (e._2() != null) e._2().forEach(proj -> { try { @@ -160,7 +155,6 @@ public class DedupRecordFactory { s.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final Collection dateofacceptance = Lists.newArrayList(); if (e._2() != null) e._2().forEach(soft -> { @@ -188,7 +182,6 @@ public class DedupRecordFactory { Datasource d = new Datasource(); //the result of the merge, to be returned at the end d.setId(e._1()); final ObjectMapper mapper = 
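DedupRecordFactory above switches from the com.fasterxml mapper to the Jackson 1.x org.codehaus mapper and drops every FAIL_ON_UNKNOWN_PROPERTIES line, so unknown JSON fields will now fail deserialization by default. A sketch of the equivalent lenient setting in the Jackson 1.x API, should tolerant parsing still be needed:

```java
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;

public class LenientMapperSketch {

    public static class Empty { }

    public static void main(String[] args) throws Exception {
        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        // Without the configure(...) line this would throw UnrecognizedPropertyException.
        Empty e = mapper.readValue("{\"extra\": 1}", Empty.class);
        System.out.println(e != null); // true
    }
}
```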
new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); if (e._2() != null) e._2().forEach(dat -> { try { @@ -213,7 +206,6 @@ public class DedupRecordFactory { o.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); StringBuilder trust = new StringBuilder("0.0"); @@ -254,7 +246,6 @@ public class DedupRecordFactory { o.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final Collection dateofacceptance = Lists.newArrayList(); diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java similarity index 74% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java index 196a8c1401..3d505888a1 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java @@ -1,35 +1,32 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; import scala.Tuple2; -import java.io.IOException; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; +import java.io.StringReader; import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; public class DedupUtility { private static final Double THRESHOLD = 0.95; @@ -54,38 +51,6 @@ public class DedupUtility { return accumulators; } - public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } - - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } - - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new 
FSDataInputStream(fileSystem.open(new Path(path))); - - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - - } - - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); } @@ -146,16 +111,20 @@ public class DedupUtility { }); } + public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); + } + public static String createEntityPath(final String basePath, final String entityType) { return String.format("%s/%s", basePath, entityType); } - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/simRel", basePath, entityType); + public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) { + return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); } - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/mergeRel", basePath, entityType); + public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } private static Double sim(Author a, Author b) { @@ -216,4 +185,37 @@ public class DedupUtility { return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } + + public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } + + return configurations; + + } + + private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = + isLookUpService.getResourceProfileByQuery(String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + final DedupConfig dedupConfig = DedupConfig.load(conf); + dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java similarity index 
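getConfigurations above pulls the orchestrator profile from the IS lookup service and walks it with dom4j, loading one DedupConfig per SCAN element. A self-contained sketch of that navigation, run against an inline profile instead of the lookup service; the ids are illustrative:

```java
import java.io.StringReader;

import org.dom4j.Document;
import org.dom4j.io.SAXReader;

public class OrchestratorParseSketch {

    public static void main(String[] args) throws Exception {
        final String profile =
                "<RESOURCE_PROFILE><DEDUPLICATION>"
              + "<ACTION_SET id='dedup-result-example'/>"
              + "<SCAN_SEQUENCE><SCAN id='conf-1'/><SCAN id='conf-2'/></SCAN_SEQUENCE>"
              + "</DEDUPLICATION></RESOURCE_PROFILE>";

        final Document doc = new SAXReader().read(new StringReader(profile));
        System.out.println(doc.valueOf("//DEDUPLICATION/ACTION_SET/@id")); // dedup-result-example
        // One DedupConfig is loaded per SCAN element.
        System.out.println(doc.selectNodes("//SCAN_SEQUENCE/SCAN").size()); // 2
    }
}
```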
99% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java index 7206f892f7..dda71fbcf7 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java similarity index 83% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java index fb347ed516..66f0b3ce67 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; public enum OafEntityType { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java new file mode 100644 index 0000000000..75b1dd01c2 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dhp.dedup; + +import com.google.common.hash.Hashing; +import eu.dnetlib.dhp.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.graphx.Edge; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.List; + +public class SparkCreateConnectedComponent { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateConnectedComponent().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String isLookUpUrl = 
parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + final JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) + s -> new Tuple2(getHashcode(s), s) + ); + + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); + final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); + final Dataset mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction) c -> + c.getDocIds() + .stream() + .flatMap(id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + tmp.add(r); + return tmp.stream(); + }).iterator()).rdd(), Encoders.bean(Relation.class)); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity)); + } + } + } + + public static long getHashcode(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java new file mode 100644 index 0000000000..51d0760e04 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java @@ -0,0 +1,62 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; + +public class SparkCreateDedupRecord { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateDedupRecord().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws 
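SparkCreateConnectedComponent turns every GraphX connected component of the similarity graph into a symmetric pair of relations per member. The pair construction, reduced to plain Java; Relation here is a stand-in POJO for eu.dnetlib.dhp.schema.oaf.Relation and the ids are illustrative:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class MergeRelSketch {

    static final class Relation {
        final String source, target, relClass;
        Relation(String source, String target, String relClass) {
            this.source = source; this.target = target; this.relClass = relClass;
        }
    }

    static List<Relation> mergeRels(String ccId, List<String> docIds) {
        final List<Relation> rels = new ArrayList<>();
        for (String id : docIds) {
            rels.add(new Relation(ccId, id, "merges"));     // root merges duplicate
            rels.add(new Relation(id, ccId, "isMergedIn")); // duplicate isMergedIn root
        }
        return rels;
    }

    public static void main(String[] args) {
        System.out.println(mergeRels("root", Arrays.asList("50|a", "50|b")).size()); // 4
    }
}
```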
ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + String subEntity = dedupConf.getWf().getSubEntityValue(); + + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); + final OafEntityType entityType = OafEntityType.valueOf(subEntity); + final JavaRDD dedupRecord = + DedupRecordFactory.createDedupRecord(sc, spark, mergeRelPath, entityPath, entityType, dedupConf); + dedupRecord.map(r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }).saveAsTextFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity)); + } + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} + diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java new file mode 100644 index 0000000000..0fc72db1e1 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -0,0 +1,134 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.List; + +public class SparkCreateSimRels implements Serializable { + + private static final Log log = LogFactory.getLog(SparkCreateSimRels.class); + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateSimRels().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException 
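Taken together, the path helpers in DedupUtility give each action set its own working subtree, with per-entity folders for each stage of the pipeline. A sketch of the resulting layout; the concrete paths and ids are illustrative:

```java
public class WorkingPathSketch {

    public static void main(String[] args) {
        final String workingPath = "/tmp/dedup_working";
        final String actionSetId = "dedup-result-example";
        final String subEntity = "publication";

        // One folder per stage, as produced by createSimRelPath,
        // createMergeRelPath and createDedupRecordPath respectively.
        System.out.println(String.format("%s/%s/%s_simrel", workingPath, actionSetId, subEntity));
        System.out.println(String.format("%s/%s/%s_mergerel", workingPath, actionSetId, subEntity));
        System.out.println(String.format("%s/%s/%s_deduprecord", workingPath, actionSetId, subEntity));
    }
}
```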
{ + + //read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + System.out.println(String.format("graphBasePath: '%s'", graphBasePath)); + System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl)); + System.out.println(String.format("actionSetId: '%s'", actionSetId)); + System.out.println(String.format("workingPath: '%s'", workingPath)); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //for each dedup configuration + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + JavaPairRDD mapDocument = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair(s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + //create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + + //create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); + + //save the simrel in the workingdir + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); + } + } + } + + /** + * Utility method used to create an atomic action from a Relation object + * @param relation input relation + * @return A tuple2 with [id, json serialization of the atomic action] + * @throws JsonProcessingException + */ + public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException { + + ObjectMapper mapper = new ObjectMapper(); + + String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget(); + AtomicAction aa = new AtomicAction<>(Relation.class, relation); + + return new Tuple2<>( + new Text(id), + new Text(mapper.writeValueAsString(aa)) + ); + } + + public Relation createSimRel(String source, String target, String entity){ + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + + switch(entity){ + case "result": + r.setRelClass("resultResult_dedupSimilarity_isSimilarTo"); + break; + case "organization": + r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo"); + break; + default: + r.setRelClass("isSimilarTo"); + break; + } + return r; + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java new file mode 100644 index 0000000000..5c7be2817e --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java @@ -0,0 +1,169 @@ +package eu.dnetlib.dhp.dedup; + +import 
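createSimRel above picks the relClass from the entity type, and createSequenceFileRow keys each serialized atomic action with source@relClass@target. Both conventions restated as a runnable sketch; the ids are illustrative:

```java
public class SimRelKeySketch {

    static String relClassFor(String entity) {
        switch (entity) {
            case "result":
                return "resultResult_dedupSimilarity_isSimilarTo";
            case "organization":
                return "organizationOrganization_dedupSimilarity_isSimilarTo";
            default:
                return "isSimilarTo";
        }
    }

    public static void main(String[] args) {
        final String source = "50|a";
        final String target = "50|b";
        System.out.println(source + "@" + relClassFor("result") + "@" + target);
    }
}
```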
com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.Optional; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; + +public class SparkPropagateRelation { + + enum FieldType { + SOURCE, + TARGET + } + + final static String SOURCEJSONPATH = "$.source"; + final static String TARGETJSONPATH = "$.target"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json"))); + parser.parseArgument(args); + + new SparkPropagateRelation().run(parser); + } + + public void run(ArgumentApplicationParser parser) { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + final Dataset mergeRels = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", "*")).as(Encoders.bean(Relation.class)); + + final JavaPairRDD mergedIds = mergeRels + .where("relClass == 'merges'") + .select(mergeRels.col("source"), mergeRels.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(1), r.getString(0))); + + JavaRDD relations = sc.textFile(DedupUtility.createEntityPath(graphBasePath, "relation")); + + JavaRDD newRels = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.SOURCE); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.TARGET); + } + return v1._2()._1(); + }).filter(SparkPropagateRelation::containsDedup) + .repartition(500); + + //update deleted by inference + relations = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + 
.leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .repartition(500); + + newRels.union(relations).repartition(1000) + .saveAsTextFile(DedupUtility.createEntityPath(dedupGraphPath, "relation"), GzipCodec.class); + } + } + + private static boolean containsDedup(final String json) { + final String source = MapDocumentUtil.getJPathString(SOURCEJSONPATH, json); + final String target = MapDocumentUtil.getJPathString(TARGETJSONPATH, json); + + return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); + } + + private static String replaceField(final String json, final String id, final FieldType type) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Relation relation = mapper.readValue(json, Relation.class); + if (relation.getDataInfo() == null) + relation.setDataInfo(new DataInfo()); + relation.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + relation.setSource(id); + return mapper.writeValueAsString(relation); + case TARGET: + relation.setTarget(id); + return mapper.writeValueAsString(relation); + default: + throw new IllegalArgumentException(""); + } + } catch (IOException e) { + throw new RuntimeException("unable to deserialize json relation: " + json, e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkPropagateRelation.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } +} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java similarity index 97% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java index 165a10b25a..c83a66e700 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.util.Reporter; import org.apache.commons.logging.Log; diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java new file mode 100644 index 0000000000..b8b41d217b --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -0,0 +1,141 @@ +package eu.dnetlib.dhp.dedup; + +import 
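// Aside (illustrative, not part of the patch): the propagate-relation job above
// pairs each relation by $.source (then by $.target), left-joins it against the
// (mergedId -> rootId) pairs taken from the merge rels, and rewrites the matched
// end. A sketch with assumed toy values (the dedup root id is made up):
//
//     String in  = "{\"source\":\"50|doi::x\",\"target\":\"50|doi::y\",\"relClass\":\"cites\"}";
//     String out = replaceField(in, "50|dedup_wf_001::abc", FieldType.SOURCE);
//     // out: source rewritten to "50|dedup_wf_001::abc", deletedbyinference = false
//
// Rewritten relations that still reference no dedup id are then dropped by
// containsDedup, while the original relations whose ends matched a merged id
// are re-emitted with deletedbyinference = true, and the two sets are unioned.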
com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; +import java.io.Serializable; + +public class SparkUpdateEntity implements Serializable { + + final String IDJSONPATH = "$.id"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json"))); + parser.parseArgument(args); + + new SparkUpdateEntity().run(parser); + } + + public boolean mergeRelExists(String basePath, String entity) throws IOException { + + boolean result = false; + + FileSystem fileSystem = FileSystem.get(new Configuration()); + + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); + + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) + if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + result = true; + } + + return result; + } + + public void run(ArgumentApplicationParser parser) throws IOException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //for each entity + for (OafEntityType entity: OafEntityType.values()) { + + JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString())); + + if (mergeRelExists(workingPath, entity.toString())) { + + final Dataset rel = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", entity.toString())).as(Encoders.bean(Relation.class)); + + final JavaPairRDD mergedIds = rel + .where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + + final JavaRDD dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString())); + + JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + + JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? 
updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1()); + sourceEntity = map.union(dedupEntity); + + } + + sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class); + + } + } + } + + public Class getOafClass(OafEntityType className) { + switch (className.toString()) { + case "publication": + return Publication.class; + case "dataset": + return eu.dnetlib.dhp.schema.oaf.Dataset.class; + case "datasource": + return Datasource.class; + case "software": + return Software.class; + case "organization": + return Organization.class; + case "otherresearchproduct": + return OtherResearchProduct.class; + case "project": + return Project.class; + default: + throw new IllegalArgumentException("Illegal type " + className); + } + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkUpdateEntity.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java similarity index 95% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java index 27a61c02d7..dd1a370c5c 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java @@ -1,7 +1,7 @@ -package eu.dnetlib.dedup.graph; +package eu.dnetlib.dhp.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.dhp.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala similarity index 96% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala rename to dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala index 38c6951528..80b0b9ef4c 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.graph +package eu.dnetlib.dhp.dedup.graph import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git 
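An illustrative note on the update step (not part of the patch): SparkUpdateEntity keeps every source record, but any id found among the merge-rel targets is re-serialized with dataInfo.deletedbyinference = true before being unioned with the new dedup records. A minimal sketch, with an assumed one-field input:

    String merged = updateDeletedByInference(
            "{\"id\":\"20|grid________::6031f94bef015a37783268ec1e75f17f\"}",
            Organization.class);   // as returned by getOafClass for "organization"
    // merged now carries "deletedbyinference": true inside its dataInfo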
a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml new file mode 100644 index 0000000000..fecd204e81 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + graphBasePath + the raw graph base path + + + workingPath + path of the working directory + + + dedupGraphPath + path of the dedup graph + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + yarn + cluster + Update Entity + eu.dnetlib.dhp.dedup.SparkUpdateEntity + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --o${dedupGraphPath} + + + + + + + + + + + yarn + cluster + Update Relations + eu.dnetlib.dhp.dedup.SparkPropagateRelation + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --o${dedupGraphPath} + --w${workingPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json new file mode 100644 index 0000000000..42ef2b78e3 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + 
"paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url for the lookup service", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path for the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json new file mode 100644 index 0000000000..f7bf5e518a --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url of the lookup service", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "the id of the actionset (orchestrator)", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json new file mode 100644 index 0000000000..8cffa86dc8 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "address for the LookUp", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json similarity index 100% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json rename to dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json new file mode 100644 index 0000000000..721a783e12 --- /dev/null +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml new file mode 100644 index 0000000000..2e0ed9aeea --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml new file mode 100644 index 0000000000..7cdf3ea216 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -0,0 +1,138 @@ + + + + graphBasePath + the raw graph base path + + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + + + workingPath + path for the working directory + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + yarn + cluster + Create Similarity Relations + eu.dnetlib.dhp.dedup.SparkCreateSimRels + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + + -mtyarn + --i${graphBasePath} + --la${isLookUpUrl} + --asi${actionSetId} + --w${workingPath} + + + + + + + + + + + yarn + cluster + Create Merge Relations + eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + + + + + yarn + cluster + Create Dedup Record + 
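<!-- Aside (illustrative, not part of the patch): the flag/value pairs passed to
     each spark action here match the "paramName" entries of the corresponding
     *_parameters.json files, and resolve inside the job through the long names.
     Mirroring the createSimRels test at the end of this patch (example values):

         ArgumentApplicationParser parser = new ArgumentApplicationParser(
                 IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream(
                         "/eu/dnetlib/dhp/dedup/createSimRels_parameters.json")));
         parser.parseArgument(new String[]{
                 "-mt", "yarn",
                 "-i", "/tmp/graph",             // graphBasePath (example)
                 "-la", "http://lookup.example", // isLookUpUrl (example)
                 "-asi", "dedup-similarity-result-levenstein",
                 "-w", "/tmp/working"});
         String graphBasePath = parser.get("graphBasePath");   // "/tmp/graph"
-->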
eu.dnetlib.dhp.dedup.SparkCreateDedupRecord + dhp-dedup-openaire-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json new file mode 100644 index 0000000000..06b67f7322 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json @@ -0,0 +1,26 @@ +[ +{ + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true +}, +{ + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true +}, +{ + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true +}, +{ + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java similarity index 77% rename from dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java rename to dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java index 817f2075c3..d6b2a79fd6 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java @@ -1,10 +1,10 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.IOUtils; import org.codehaus.jackson.map.ObjectMapper; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Arrays; @@ -13,12 +13,12 @@ import java.util.stream.Collectors; public class MergeAuthorTest { - List publicationsToMerge; - final ObjectMapper mapper = new ObjectMapper(); + private List publicationsToMerge; + private final ObjectMapper mapper = new ObjectMapper(); - @Before + @BeforeEach public void setUp() throws Exception { - final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json")); + final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> { @@ -28,34 +28,18 @@ public class MergeAuthorTest { throw new RuntimeException(e); } }).collect(Collectors.toList()); - - - } - @Test public void test() throws Exception { Publication dedup = new Publication(); - publicationsToMerge.forEach(p-> { dedup.mergeFrom(p); 
dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor())); }); - - - - - - - System.out.println(mapper.writeValueAsString(dedup)); - - } - - } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java similarity index 52% rename from dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java rename to dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index a7b7cb8c84..1b8df02b50 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -1,12 +1,10 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import org.apache.commons.io.IOUtils; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import java.io.IOException; @@ -15,41 +13,28 @@ public class SparkCreateDedupTest { String configuration; String entity = "organization"; - @Before + @BeforeEach public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub_scholix.conf.json")); - +// configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); + configuration = ""; } - - - - @Test - public void PropagateRelationsTest() throws Exception { - SparkPropagateRelationsJob.main(new String[] { + @Disabled("must be parametrized to run locally") + public void createSimRelsTest() throws Exception { + SparkCreateSimRels.main(new String[]{ "-mt", "local[*]", - - - "-ep", "/Users/sandro/Downloads/scholix/graph/relation", - "-mr", "/Users/sandro/Downloads/scholix/dedupGraphWD/publication/mergeRel", - "-mt", "local[*]", - "-t", "/Users/sandro/Downloads/scholix/dedupGraphWD/publication/rel_fixed", + "-i", "/Users/miconis/dumps", + "-o", "/tmp/dedup/rawset_test", + "-asi", "dedup-similarity-result-levenstein", + "-la", "lookupurl", + "-w", "workingPath" }); } - - @Test - public void createDeletedByInference() throws Exception { - SparkUpdateEntityJob.main(new String[] { - "-mt", "local[*]" - }); - } - - @Test - @Ignore + @Disabled("must be parametrized to run locally") public void createCCTest() throws Exception { - SparkCreateConnectedComponent.main(new String[] { + SparkCreateConnectedComponent.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -58,10 +43,9 @@ public class SparkCreateDedupTest { }); } - @Test - @Ignore + @Disabled("must be parametrized to run locally") public void dedupRecordTest() throws Exception { - SparkCreateDedupRecord.main(new String[] { + SparkCreateDedupRecord.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -70,24 +54,21 @@ public class SparkCreateDedupTest { }); } - @Test + @Disabled("must be parametrized to run locally") public void printConfiguration() throws Exception { System.out.println(ArgumentApplicationParser.compressArgument(configuration)); } - @Test + @Disabled("must be parametrized to run locally") public void testHashCode() { final String s1 = 
"20|grid________::6031f94bef015a37783268ec1e75f17f"; final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46"; final HashFunction hashFunction = Hashing.murmur3_128(); - System.out.println( s1.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); - System.out.println( s2.hashCode()); - System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); - + System.out.println(s1.hashCode()); + System.out.println(hashFunction.hashString(s1).asLong()); + System.out.println(s2.hashCode()); + System.out.println(hashFunction.hashString(s2).asLong()); } - - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java new file mode 100644 index 0000000000..76af1fa902 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java @@ -0,0 +1,289 @@ +package eu.dnetlib.dhp.dedup.jpath; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.junit.jupiter.api.Test; + +public class JsonPathTest { + + String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":f
alse,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = DedupConfig.load("{\n" + + " \"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} 
}\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " },\n" + + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : 
\"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": 
[\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": 
[\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": 
[\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": 
[\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": 
[\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": 
[\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); + + @Test + public void testJPath () throws Exception { + + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + + System.out.println("d = " + d); + + } +} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json similarity index 97% rename 
from dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json rename to dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json index 2d09055626..726f2b8997 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json @@ -3,6 +3,7 @@ "threshold" : "0.99", "dedupRun" : "001", "entityType" : "organization", + "subEntityValue": "organization", "orderField" : "legalname", "queueMaxSize" : "2000", "groupMaxSize" : "50", @@ -87,8 +88,8 @@ } } ], - "threshold": 0.7, - "aggregation": "W_MEAN", + "threshold": 0.1, + "aggregation": "AVG", "positive": "layer4", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -106,7 +107,7 @@ } } ], - "threshold": 0.9, + "threshold": 0.7, "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", @@ -129,7 +130,9 @@ "comparator": "jaroWinklerNormalizedName", "weight": 0.1, "countIfUndefined": "false", - "params": {} + "params": { + "windowSize": 4 + } } ], "threshold": 0.9, @@ -145,14 +148,14 @@ { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json similarity index 96% rename from dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json rename to dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json index 6ca0ecd531..d471ccb89d 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json +++ 
b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json @@ -28,34 +28,10 @@ "idPath": "$.id" }, "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - }, - { - "name": "lowercase", - "fields": [ - "doi" - ], - "params": {} - } + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { "start": { diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/sample.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json similarity index 100% rename from dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/sample.json rename to dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml index aa278d265f..f1b51a7094 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/pom.xml @@ -4,7 +4,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java deleted file mode 100644 index 16e112c252..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ /dev/null @@ -1,79 +0,0 @@ -package eu.dnetlib.dedup; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.ArrayList; -import java.util.List; - -public class SparkCreateConnectedComponent { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new 
ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) - s -> new Tuple2(getHashcode(s), s) - ); - - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); - final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> - c.getDocIds() - .stream() - .flatMap(id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity)); - } - - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashUnencodedChars(id).asLong(); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java deleted file mode 100644 index 8e60df945a..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ /dev/null @@ -1,34 +0,0 @@ -package eu.dnetlib.dedup; - -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.pace.config.DedupConfig; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; - -public class SparkCreateDedupRecord { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new 
JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaRDD dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); - dedupRecord.map(r-> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }).saveAsTextFile(dedupPath+"/"+entity+"/dedup_records"); - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java deleted file mode 100644 index 2bdfa8759b..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ /dev/null @@ -1,73 +0,0 @@ -package eu.dnetlib.dedup; - -import com.google.common.hash.Hashing; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.List; - - -/** - * This Spark class creates similarity relations between entities, saving result - * - * param request: - * sourcePath - * entityType - * target Path - */ -public class SparkCreateSimRels { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - - - JavaPairRDD mapDocument = sc.textFile(inputPath + "/" + entity) - .mapToPair(s->{ - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s); - return new Tuple2<>(d.getIdentifier(), d);}); - - //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); -// JavaPairRDD> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf); - - //create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); -// final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, dedupConf); - - final JavaRDD isSimilarToRDD = dedupRels.map(simRel -> { - final Relation r = new 
Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); - - spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); - - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json deleted file mode 100644 index 8ba8515d0e..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", - "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", - "paramRequired": true - }, - { - "paramName": "c", - "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "target path to save dedup result", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/propagaterels/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/propagaterels/oozie_app/config-default.xml index fcab9dd00a..8d8766283f 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/propagaterels/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/propagaterels/oozie_app/config-default.xml @@ -19,6 +19,10 @@ hive_metastore_uris thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + hive_jdbc_url + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + hive_db_name openaire diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java deleted file mode 100644 index 7a63cfe24e..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package eu.dnetlib.dedup.jpath; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; -import org.apache.commons.io.IOUtils; -import org.junit.Test; -import java.util.List; -import java.util.Map; - -public class JsonPathTest { - - @Test - public void testJPath () throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/sample.json")); - List> pid = JsonPath.read(json, "$.pid[*]"); -// System.out.println(json); - - pid.forEach(it -> { - try { - System.out.println(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { - e.printStackTrace(); - } - }); - - - - - } -} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json deleted file 
mode 100644 index 3e861fb715..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json +++ /dev/null @@ -1,280 +0,0 @@ -{ - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "subEntityType" : "resulttype", - "subEntityValue" : "publication", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "100", - "maxChildren" : "100", - "idPath": "$.id", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } - ], - "strictConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } - ], - "conditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } - ], - "model" : [ - { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" }, - { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 }, - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 }, - { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" } - ], - "synonyms": {}, - "blacklists" : { - "title" : [ - "^Inside Front Cover$", - "(?i)^Poster presentations$", - "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", - "^Problems with perinatal pathology\\.?$", - "(?i)^Cases? of Puerperal Convulsions$", - "(?i)^Operative Gyna?ecology$", - "(?i)^Mind the gap\\!?\\:?$", - "^Chronic fatigue syndrome\\.?$", - "^Cartas? ao editor Letters? to the Editor$", - "^Note from the Editor$", - "^Anesthesia Abstract$", - - "^Annual report$", - "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", - "(?i)^Graph and Table of Infectious Diseases?$", - "^Presentation$", - "(?i)^Reviews and Information on Publications$", - "(?i)^PUBLIC HEALTH SERVICES?$", - "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", - "(?i)^Adrese autora$", - "(?i)^Systematic Part .*\\. 
Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", - "(?i)^Acknowledgement to Referees$", - "(?i)^Behçet's disease\\.?$", - "(?i)^Isolation and identification of restriction endonuclease.*$", - "(?i)^CEREBROVASCULAR DISEASES?.?$", - "(?i)^Screening for abdominal aortic aneurysms?\\.?$", - "^Event management$", - "(?i)^Breakfast and Crohn's disease.*\\.?$", - "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", - "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", - "^Gushi hakubutsugaku$", - - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", - "^Intestinal spirocha?etosis$", - "^Treatment of Rodent Ulcer$", - "(?i)^\\W*Cloud Computing\\W*$", - "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", - "^Free Communications, Poster Presentations: Session [A-F]$", - - "^“The Historical Aspects? of Quackery\\.?”$", - "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", - "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", - "(?i)^Case Report$", - "^Boletín Informativo$", - "(?i)^Glioblastoma Multiforme$", - "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", - "^Zaměstnanecké výhody$", - "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", - "(?i)^Carotid body tumours?\\.?$", - "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", - "^Avant-propos$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", - "(?i)^PUBLIC HEALTH VERSUS THE STATE$", - "^Viñetas de Cortázar$", - "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? 
bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", - "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", - "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", - "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", - "^Aus der AGMB$", - - "^Znanstveno-stručni prilozi$", - "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", - "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", - "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", - "^Finanční analýza podniku$", - "^Financial analysis( of business)?$", - "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", - "^Jikken nihon shūshinsho$", - "(?i)^CORONER('|s)(s|') INQUESTS$", - "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", - "(?i)^Consultants' contract(s)?$", - "(?i)^Upute autorima$", - "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", - "^Joshi shin kokubun$", - "^Kōtō shōgaku dokuhon nōson'yō$", - "^Jinjō shōgaku shōka$", - "^Shōgaku shūjichō$", - "^Nihon joshi dokuhon$", - "^Joshi shin dokuhon$", - "^Chūtō kanbun dokuhon$", - "^Wabun dokuhon$", - "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", - "(?i)^cardiac rehabilitation$", - "(?i)^Analytical summary$", - "^Thesaurus resolutionum Sacrae Congregationis Concilii$", - "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", - "^Prikazi i osvrti$", - "^Rodinný dům s provozovnou$", - "^Family house with an establishment$", - "^Shinsei chūtō shin kokugun$", - "^Pulmonary alveolar proteinosis(\\.?)$", - "^Shinshū kanbun$", - "^Viñeta(s?) de Rodríguez$", - "(?i)^RUBRIKA UREDNIKA$", - "^A Matching Model of the Academic Publication Market$", - "^Yōgaku kōyō$", - - "^Internetový marketing$", - "^Internet marketing$", - "^Chūtō kokugo dokuhon$", - "^Kokugo dokuhon$", - "^Antibiotic Cover for Dental Extraction(s?)$", - "^Strategie podniku$", - "^Strategy of an Enterprise$", - "(?i)^respiratory disease(s?)(\\.?)$", - "^Award(s?) for Gallantry in Civil Defence$", - "^Podniková kultura$", - "^Corporate Culture$", - "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", - "^Pracovní motivace$", - "^Work Motivation$", - "^Kaitei kōtō jogaku dokuhon$", - "^Konsolidovaná účetní závěrka$", - "^Consolidated Financial Statements$", - "(?i)^intracranial tumour(s?)$", - "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", - "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", - "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", - "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", - "^Úroveň motivačního procesu jako způsobu vedení lidí$", - "^The level of motivation process as a leadership$", - "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) 
Hospitals$", - "(?i)^news and events$", - "(?i)^NOVOSTI I DOGAĐAJI$", - "^Sansū no gakushū$", - "^Posouzení informačního systému firmy a návrh změn$", - "^Information System Assessment and Proposal for ICT Modification$", - "^Stresové zatížení pracovníků ve vybrané profesi$", - "^Stress load in a specific job$", - - "^Sunday: Poster Sessions, Pt.*$", - "^Monday: Poster Sessions, Pt.*$", - "^Wednesday: Poster Sessions, Pt.*", - "^Tuesday: Poster Sessions, Pt.*$", - - "^Analýza reklamy$", - "^Analysis of advertising$", - - "^Shōgaku shūshinsho$", - "^Shōgaku sansū$", - "^Shintei joshi kokubun$", - "^Taishō joshi kokubun dokuhon$", - "^Joshi kokubun$", - - "^Účetní uzávěrka a účetní závěrka v ČR$", - "(?i)^The \"?Causes\"? of Cancer$", - "^Normas para la publicación de artículos$", - "^Editor('|s)(s|') [Rr]eply$", - "^Editor(’|s)(s|’) letter$", - "^Redaktoriaus žodis$", - "^DISCUSSION ON THE PRECEDING PAPER$", - "^Kōtō shōgaku shūshinsho jidōyō$", - "^Shōgaku nihon rekishi$", - "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", - "^Préface$", - "^Occupational [Hh]ealth [Ss]ervices.$", - "^In Memoriam Professor Toshiyuki TAKESHIMA$", - "^Účetní závěrka ve vybraném podniku.*$", - "^Financial statements in selected company$", - "^Abdominal [Aa]ortic [Aa]neurysms.*$", - "^Pseudomyxoma peritonei$", - "^Kazalo autora$", - - "(?i)^uvodna riječ$", - "^Motivace jako způsob vedení lidí$", - "^Motivation as a leadership$", - "^Polyfunkční dům$", - "^Multi\\-funkcional building$", - "^Podnikatelský plán$", - "(?i)^Podnikatelský záměr$", - "(?i)^Business Plan$", - "^Oceňování nemovitostí$", - "^Marketingová komunikace$", - "^Marketing communication$", - "^Sumario Analítico$", - "^Riječ uredništva$", - "^Savjetovanja i priredbe$", - "^Índice$", - "^(Starobosanski nadpisi).*$", - "^Vzdělávání pracovníků v organizaci$", - "^Staff training in organization$", - "^(Life Histories of North American Geometridae).*$", - "^Strategická analýza podniku$", - "^Strategic Analysis of an Enterprise$", - "^Sadržaj$", - "^Upute suradnicima$", - "^Rodinný dům$", - "(?i)^Fami(l)?ly house$", - "^Upute autorima$", - "^Strategic Analysis$", - "^Finanční analýza vybraného podniku$", - "^Finanční analýza$", - "^Riječ urednika$", - "(?i)^Content(s?)$", - "(?i)^Inhalt$", - "^Jinjō shōgaku shūshinsho jidōyō$", - "(?i)^Index$", - "^Chūgaku kokubun kyōkasho$", - "^Retrato de una mujer$", - "^Retrato de un hombre$", - "^Kōtō shōgaku dokuhon$", - "^Shotōka kokugo$", - "^Shōgaku dokuhon$", - "^Jinjō shōgaku kokugo dokuhon$", - "^Shinsei kokugo dokuhon$", - "^Teikoku dokuhon$", - "^Instructions to Authors$", - "^KİTAP TAHLİLİ$", - "^PRZEGLĄD PIŚMIENNICTWA$", - "(?i)^Presentación$", - "^İçindekiler$", - "(?i)^Tabl?e of contents$", - "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", - "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", - "^Editorial( Board)?$", - "(?i)^Editorial \\(English\\)$", - "^Editörden$", - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. 
de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*", - "(?i)^.*authors['’′]? reply\\.?$", - "(?i)^.*authors['’′]? response\\.?$" - ] - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json deleted file mode 100644 index 4e8b66d1bb..0000000000 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json +++ /dev/null @@ -1,3 +0,0 @@ -{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[],"affiliation":[]},{"fullname":"Gayubo, Severiano F.","name":"","surname":"","rank":2,"pid":[{"value":"ORCID1","qualifier":{"classid":"orcid","classname":"orcid","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Ciccio 
Pasticcio","name":"","surname":"","rank":2,"pid":[],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. 
Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} -{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[{"fullname":"Nemkov, Pavel G.","name":"","surname":"","rank":1,"pid":[{"value":"ORCIDDIO","qualifier":{"classid":"ORCID","classname":"ORCID","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]},{"fullname":"Gayubo, Severiano 
F.","name":"","surname":"","rank":2,"pid":[{"value":"MAGGLES","qualifier":{"classid":"mag","classname":"mag","schemeid":"dnet:pidType","schemename":"dnet:pidType"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"affiliation":[]}],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} 
-{"journal":{"name":"","issnPrinted":"","issnOnline":"","issnLinking":"","ep":"","iss":"","sp":"","vol":"","edition":"","conferenceplace":"","conferencedate":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"author":[],"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dent:languages","schemename":"dent:languages"},"country":[],"subject":[{"value":"Biodiversity","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Taxonomy","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Animalia","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Arthropoda","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Insecta","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Hymenoptera","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},{"value":"Crabronidae","qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject","schemename":"dnet:subject"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"title":[{"value":"A New Species Of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) From Turkmenistan","qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"relevantdate":[{"value":"2003-12-31","qualifier":{"classid":"dnet:date","classname":"dnet:date","schemeid":"dnet:date","schemename":"dnet:date"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"description":[{"value":"Nemkov, Pavel G., Gayubo, Severiano F. (2003): A new species of Nysson Latreille (Hymenoptera: Crabronidae: Bembicinae) from Turkmenistan. Zootaxa 144: 1-4, DOI: 10.5281/zenodo.156314","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"Zenodo","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"refereed":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"context":[],"id":"50|scholexplore::ceb3a5d32107897a0df1178211e3e9ca","originalId":[],"collectedfrom":[{"key":"10|openaire____::e034d6a11054f5ade9221ebac484e864","value":"scholExplorer","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"pid":[{"value":"10.5281/zenodo.156314","qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}}],"extraInfo":[],"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":true,"inferenceprovenance":"dedup-similarity-result-levenstein","provenanceaction":{"classid":"sysimport:actionset","classname":"sysimport:actionset","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":0} diff --git a/dhp-workflows/dhp-distcp/pom.xml b/dhp-workflows/dhp-distcp/pom.xml index 3b2f3176bb..c3f09b42c1 100644 --- a/dhp-workflows/dhp-distcp/pom.xml +++ b/dhp-workflows/dhp-distcp/pom.xml @@ -3,7 +3,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 641fbd933c..dc9e2d9e8b 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -4,7 +4,7 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 
1.1.6-SNAPSHOT 4.0.0 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java index ab19ff2b51..0291be47ef 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/GraphMappingUtils.java @@ -1,23 +1,31 @@ package eu.dnetlib.dhp.graph; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.*; - import java.util.Map; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + public class GraphMappingUtils { - public final static Map types = Maps.newHashMap(); + public final static Map types = Maps.newHashMap(); - static { - types.put("datasource", Datasource.class); - types.put("organization", Organization.class); - types.put("project", Project.class); - types.put("dataset", Dataset.class); - types.put("otherresearchproduct", OtherResearchProduct.class); - types.put("software", Software.class); - types.put("publication", Publication.class); - types.put("relation", Relation.class); - } + static { + types.put("datasource", Datasource.class); + types.put("organization", Organization.class); + types.put("project", Project.class); + types.put("dataset", Dataset.class); + types.put("otherresearchproduct", OtherResearchProduct.class); + types.put("software", Software.class); + types.put("publication", Publication.class); + types.put("relation", Relation.class); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java index a6a4e92911..95c3cd4800 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/graph/SparkGraphImporterJob.java @@ -3,48 +3,50 @@ package eu.dnetlib.dhp.graph; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.Text; -import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import scala.Tuple2; public class SparkGraphImporterJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/graph/input_graph_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkGraphImporterJob.class.getSimpleName()) - .master(parser.get("master")) - .config("hive.metastore.uris", 
parser.get("hive_metastore_uris")) - .enableHiveSupport() - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String hiveDbName = parser.get("hive_db_name"); + try(SparkSession spark = getSparkSession(parser)) { - spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String hiveDbName = parser.get("hive_db_name"); - // Read the input file and convert it into RDD of serializable object - GraphMappingUtils.types.forEach((name, clazz) -> { - final JavaRDD> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class) - .map(item -> new Tuple2<>(item._1.toString(), item._2.toString())); + spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); - spark.createDataset(inputRDD - .filter(s -> s._1().equals(clazz.getName())) - .map(Tuple2::_2) + // Read the input file and convert it into RDD of serializable object + GraphMappingUtils.types.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name) .map(s -> new ObjectMapper().readValue(s, clazz)) .rdd(), Encoders.bean(clazz)) .write() .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name); - }); + .saveAsTable(hiveDbName + "." + name)); + } + } + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + + return SparkSession + .builder() + .appName(SparkGraphImporterJob.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/Application/ConvertXMLToEntities/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/Application/ConvertXMLToEntities/oozie_app/workflow.xml index a1faaa0f54..bbee2f01cb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/Application/ConvertXMLToEntities/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/Application/ConvertXMLToEntities/oozie_app/workflow.xml @@ -1,12 +1,13 @@ + sourcePath the source path - targetPath - the source path + hive_db_name + the target hive database name sparkDriverMemory @@ -20,40 +21,67 @@ sparkExecutorCores number of cores used by single executor - - entity - the entity type - - + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - + + + + yarn + cluster + MapGraphAsHiveDB + eu.dnetlib.dhp.graph.SparkGraphImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mt yarn-cluster + --sourcePath${sourcePath} + --hive_db_name${hive_db_name} + --hive_metastore_uris${hive_metastore_uris} + + - 
- + + ${jobTracker} ${nameNode} - yarn-cluster - cluster - Import ${entity} and related entities - eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerGraphImporter - dhp-graph-mapper-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} - -mt yarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - + + + hive.metastore.uris + ${hive_metastore_uris} + + + ${hive_jdbc_url}/${hive_db_name} + + hive_db_name=${hive_db_name} + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/lib/scripts/postprocessing.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/lib/scripts/postprocessing.sql new file mode 100644 index 0000000000..c92f8d1af0 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/lib/scripts/postprocessing.sql @@ -0,0 +1,10 @@ +DROP VIEW IF EXISTS ${hive_db_name}.result; + +CREATE VIEW IF NOT EXISTS result as + select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.publication p + union all + select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.dataset d + union all + select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.software s + union all + select id, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, embargoenddate, resourcetype, context, instance from ${hive_db_name}.otherresearchproduct o; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/ImportDataFromMongoTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/ImportDataFromMongoTest.java deleted file mode 100644 index 50248c83d7..0000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/ImportDataFromMongoTest.java +++ /dev/null @@ -1,22 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import org.junit.Test; - -public class ImportDataFromMongoTest { - - @Test - public void doTest() throws Exception { - ImportDataFromMongo.main(new String[] { - "-h", "localhost", - "-p", "2800", - "-f", "PMF", - "-l", "store", - "-i", "cleaned", - "-dn", "mdstore_dli", - "-n", "file:///home/sandro/test.seq", - "-u", "sandro", - "-t", "file:///home/sandro/test.seq" - }); - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImportCounterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImportCounterTest.java deleted file mode 100644 index a8e810d4f8..0000000000 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImportCounterTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package eu.dnetlib.dhp.graph; - -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.List; -import 
java.util.stream.Collectors; - -public class SparkGraphImportCounterTest { - - public static List<Tuple2<String, Long>> countEntities(final String inputPath) throws Exception { - - final SparkSession spark = SparkSession - .builder() - .appName(SparkGraphImportCounterTest.class.getSimpleName()) - .master("local[*]") - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - return GraphMappingUtils.types.entrySet() - .stream() - .map(entry -> { - final Long count = spark.read().load(inputPath + "/" + entry.getKey()).as(Encoders.bean(entry.getValue())).count(); - return new Tuple2<>(entry.getKey(), count); - }) - .collect(Collectors.toList()); - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java index 2a8703f864..c7743d6845 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/SparkGraphImporterJobTest.java @@ -1,38 +1,52 @@ package eu.dnetlib.dhp.graph; -import org.apache.commons.io.FileUtils; -import org.junit.*; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import scala.Tuple2; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; public class SparkGraphImporterJobTest { private static final long MAX = 1000L; - private Path testDir; - @Before - public void setup() throws IOException { - testDir = Files.createTempDirectory(getClass().getSimpleName()); - } - - @After - public void tearDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } - - @Test - @Ignore - public void testImport() throws Exception { + @Test + @Disabled("must be parametrized to run locally") + public void testImport(@TempDir Path outPath) throws Exception { SparkGraphImporterJob.main(new String[] { "-mt", "local[*]", - "-i", getClass().getResource("/eu/dnetlib/dhp/dhp-sample/part-m-00010").getPath(), - "-o", testDir.toString()}); + "-s", getClass().getResource("/eu/dnetlib/dhp/graph/sample").getPath(), + "-h", "", + "-db", "test" + }); - SparkGraphImportCounterTest.countEntities(testDir.toString()).forEach(t -> { + countEntities(outPath.toString()).forEach(t -> { System.out.println(t); - //Assert.assertEquals(String.format("mapped %s must be %s", t._1(), MAX), MAX, t._2().longValue()); + Assertions.assertEquals(MAX, t._2().longValue(), String.format("mapped %s must be %s", t._1(), MAX)); }); } + + public static List<Tuple2<String, Long>> countEntities(final String inputPath) { + + final SparkSession spark = SparkSession + .builder() + .appName(SparkGraphImporterJobTest.class.getSimpleName()) + .master("local[*]") + .getOrCreate(); + //final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + return GraphMappingUtils.types.entrySet() + .stream() + .map(entry -> { + final Long count = spark.read().load(inputPath + "/" + entry.getKey()).as(Encoders.bean(entry.getValue())).count(); + return new Tuple2<>(entry.getKey(), count); + }) + .collect(Collectors.toList()); + } }
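
A quick aside on the pattern the importer and its test above both lean on: JSON text lines are deserialized into model beans and exposed as a typed Spark Dataset through Encoders.bean, written out as a table, then read back and counted per entity type. Below is a minimal, self-contained sketch of that idiom; the Pojo bean, the input path and the table name are invented for illustration and do not come from the codebase.

    import com.fasterxml.jackson.databind.ObjectMapper;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class EncoderBeanSketch {

        // hypothetical bean standing in for the dhp-schemas model classes
        public static class Pojo implements java.io.Serializable {
            private String id;
            public String getId() { return id; }
            public void setId(String id) { this.id = id; }
        }

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .appName("encoder-bean-sketch")
                    .master("local[*]")
                    .getOrCreate();
            JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

            // read JSON lines and deserialize each into the bean; a shared,
            // per-partition ObjectMapper would be cheaper than one per record
            Dataset<Pojo> ds = spark.createDataset(
                    sc.textFile("/tmp/sample/pojo")        // invented input path
                            .map(s -> new ObjectMapper().readValue(s, Pojo.class))
                            .rdd(),
                    Encoders.bean(Pojo.class));

            // same terminal step as the importer: overwrite one table per type
            ds.write().mode(SaveMode.Overwrite).saveAsTable("pojo_table");

            spark.stop();
        }
    }

Encoders.bean derives the Dataset schema from the bean's getters and setters, which is why the model classes only need to remain JavaBean-compliant for this to work.

diff --git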
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java index ead2ddf226..2185b7987e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/ScholexplorerParserTest.java @@ -7,9 +7,8 @@ import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import java.io.IOException; import java.util.List; public class ScholexplorerParserTest { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java index c6e4bac1d0..505e7581a8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerGraphImporterTest.java @@ -1,19 +1,11 @@ package eu.dnetlib.dhp.graph.scholexplorer; -import org.junit.Test; + + public class SparkScholexplorerGraphImporterTest { - @Test - - public void testImport() throws Exception { - SparkScholexplorerGraphImporter.main(new String[]{ - "-mt", "local[*]", - "-e", "publication", - "-s", "file:///data/scholexplorer_dump/pmf.dli.seq", - "-t", "file:///data/scholexplorer_dump/pmf_dli_with_rel"} - ); - } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java index 0ab51f6f61..7a93c58344 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/graph/scholexplorer/SparkScholexplorerMergeEntitiesJobTest.java @@ -1,18 +1,8 @@ package eu.dnetlib.dhp.graph.scholexplorer; -import org.junit.Ignore; -import org.junit.Test; + public class SparkScholexplorerMergeEntitiesJobTest { - @Test - @Ignore - public void testMerge() throws Exception { - SparkScholexplorerMergeEntitiesJob.main(new String[]{ - "-mt", "local[*]", - "-e", "relation", - "-s", "file:///Users/sandro/Downloads/scholix/relation", - "-t", "file:///Users/sandro/Downloads/scholix/relation"} - ); - } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz new file mode 100644 index 0000000000..ce0b9709be Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/dataset/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/datasource/datasource_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/datasource/datasource_10.json.gz new file mode 100644 index 0000000000..130dd4c36f 
Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/datasource/datasource_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/organization/organization_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/organization/organization_10.json.gz new file mode 100644 index 0000000000..01a2e28ed4 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/organization/organization_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/otherresearchproduct/otherresearchproduct_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/otherresearchproduct/otherresearchproduct_10.json.gz new file mode 100644 index 0000000000..20b6a4dba3 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/otherresearchproduct/otherresearchproduct_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/project/project_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/project/project_10.json.gz new file mode 100644 index 0000000000..b8df66b0b4 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/project/project_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/publication/publication_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/publication/publication_10.json.gz new file mode 100644 index 0000000000..257e0db3a4 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/publication/publication_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/relation/relation_100.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/relation/relation_100.json.gz new file mode 100644 index 0000000000..82a685ad26 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/relation/relation_100.json.gz differ diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/software/software_10.json.gz b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/software/software_10.json.gz new file mode 100644 index 0000000000..c2389b7677 Binary files /dev/null and b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/graph/sample/software/software_10.json.gz differ diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml index 913ab76de7..bb41858a6f 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/pom.xml @@ -4,12 +4,43 @@ dhp-workflows eu.dnetlib.dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT 4.0.0 dhp-graph-provision-scholexplorer + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + org.apache.spark diff --git 
a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java index be06380f72..b5142447d1 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java @@ -1,23 +1,12 @@ package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.provision.scholix.Scholix; import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import org.apache.commons.io.IOUtils; -import org.junit.Ignore; -import org.junit.Test; - -import scala.Tuple2; +import org.junit.jupiter.api.Test; public class ExtractInfoTest { - - - - - - - @Test public void testSerialization() throws Exception { @@ -29,7 +18,6 @@ public class ExtractInfoTest { System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); } - @Test public void testScholix() throws Exception { final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); @@ -37,18 +25,4 @@ public class ExtractInfoTest { Scholix.generateScholixWithSource(jsonSummary, jsonRelation); } - - - @Test - - public void testIndex() throws Exception { - SparkGenerateScholix.main( - - new String[] { - "-mt", "local[*]", - "-w", "/Users/sandro/Downloads/scholix/provision", - "-g", "/Users/sandro/Downloads/scholix/graph" - } - ); - } } diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties new file mode 100644 index 0000000000..8230dfc18f --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -0,0 +1,14 @@ +sparkExecutorCoresForJoining=1 +sparkDriverMemoryForJoining=10G +sparkExecutorMemoryForJoining=15G +sparkExecutorCoresForIndexing=64 +sparkDriverMemoryForIndexing=3G +sparkExecutorMemoryForIndexing=2G +#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp +isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl +sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 +outputPath=/tmp/openaire_provision +format=TMF +batchSize=2000 +reuseRecords=false +otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/pom.xml b/dhp-workflows/dhp-graph-provision/pom.xml new file mode 100644 index 0000000000..ac4e01d218 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/pom.xml @@ -0,0 +1,84 @@ + + + + dhp-workflows + eu.dnetlib.dhp + 1.1.6-SNAPSHOT + + 4.0.0 + + dhp-graph-provision + + + + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + com.jayway.jsonpath + json-path + + + dom4j + dom4j + + + jaxen + jaxen + + + com.mycila.xmltool + xmltool + + + org.antlr + stringtemplate + + + org.apache.solr + solr-solrj + + + com.lucidworks.spark + spark-solr + + + + org.apache.httpcomponents + httpclient + + + org.apache.httpcomponents + httpmime + + + + org.noggit + noggit + + + org.apache.zookeeper + zookeeper + + + + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + + + + \ No newline at end of file
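
The job-override.properties file above is a flat java.util.Properties file; in the workflow these values travel as Oozie parameters, but a short sketch of how the joining settings could be loaded into a SparkConf makes their role concrete. The file path and the idea of loading it directly in code are assumptions for illustration only; the property names come from the file above.

    import org.apache.spark.SparkConf;

    import java.io.FileReader;
    import java.util.Properties;

    public class JobOverrideSketch {

        public static void main(String[] args) throws Exception {
            Properties p = new Properties();
            // hypothetical local path to the override file shown above
            try (FileReader in = new FileReader("dhp-workflows/dhp-graph-provision/job-override.properties")) {
                p.load(in);
            }

            // map the joining settings onto Spark's standard keys
            SparkConf conf = new SparkConf()
                    .set("spark.driver.memory", p.getProperty("sparkDriverMemoryForJoining", "4G"))
                    .set("spark.executor.memory", p.getProperty("sparkExecutorMemoryForJoining", "8G"))
                    .set("spark.executor.cores", p.getProperty("sparkExecutorCoresForJoining", "1"));

            System.out.println(conf.toDebugString());
        }
    }

diff --git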
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java new file mode 100644 index 0000000000..d260e05512 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/GraphJoiner.java @@ -0,0 +1,291 @@ +package eu.dnetlib.dhp.graph; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.graph.model.*; +import eu.dnetlib.dhp.graph.utils.ContextMapper; +import eu.dnetlib.dhp.graph.utils.GraphMappingUtils; +import eu.dnetlib.dhp.graph.utils.RelationPartitioner; +import eu.dnetlib.dhp.graph.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkContext; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Map; + +import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity; + +/** + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. + * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization) + * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * + * The operation is implemented by creating the union between the entity types (E), joined by the relationships (R), and again + * by E, finally grouped by E.id; + * + * Different manipulations of the E and R sets are introduced to reduce the complexity of the operation + * 1) treat the object payload as a string, extracting only the necessary information beforehand using json path; + * deserializing it with jackson's object mapper has a higher memory footprint. + * + * 2) only consider relations that are not virtually deleted ($.dataInfo.deletedbyinference == false) + * 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S + * and E_target = T.
Objects in T are heavily pruned by all the unnecessary information + * + * 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ ] + */ +public class GraphJoiner implements Serializable { + + private Map accumulators = Maps.newHashMap(); + + public static final int MAX_RELS = 100; + + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; + + private SparkSession spark; + + private ContextMapper contextMapper; + + private String inputPath; + + private String outPath; + + private String otherDsTypeId; + + public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) { + this.spark = spark; + this.contextMapper = contextMapper; + this.otherDsTypeId = otherDsTypeId; + this.inputPath = inputPath; + this.outPath = outPath; + + final SparkContext sc = spark.sparkContext(); + prepareAccumulators(sc); + } + + public GraphJoiner adjacencyLists() { + final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext()); + + // read each entity + JavaPairRDD datasource = readPathEntity(jsc, getInputPath(), "datasource"); + JavaPairRDD organization = readPathEntity(jsc, getInputPath(), "organization"); + JavaPairRDD project = readPathEntity(jsc, getInputPath(), "project"); + JavaPairRDD dataset = readPathEntity(jsc, getInputPath(), "dataset"); + JavaPairRDD otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct"); + JavaPairRDD software = readPathEntity(jsc, getInputPath(), "software"); + JavaPairRDD publication = readPathEntity(jsc, getInputPath(), "publication"); + + // create the union between all the entities + final String entitiesPath = getOutPath() + "/entities"; + datasource + .union(organization) + .union(project) + .union(dataset) + .union(otherresearchproduct) + .union(software) + .union(publication) + .map(e -> new EntityRelEntity().setSource(e._2())) + .map(GraphMappingUtils::serialize) + .saveAsTextFile(entitiesPath, GzipCodec.class); + + JavaPairRDD entities = jsc.textFile(entitiesPath) + .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class)) + .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t)); + + final String relationPath = getOutPath() + "/relation"; + // reads the relationships + final JavaPairRDD rels = readPathRelation(jsc, getInputPath()) + .filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted + .map(p -> new EntityRelEntity().setRelation(p)) + .mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p)); + rels + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .map(s -> new ObjectMapper().writeValueAsString(s)) + .saveAsTextFile(relationPath, GzipCodec.class); + + final JavaPairRDD relation = jsc.textFile(relationPath) + .map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class)) + .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p)); + + final String bySourcePath = getOutPath() + "/join_by_source"; + relation + .join(entities + .filter(e -> !e._2().getSource().getDeleted()) + .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2())))) + .map(s -> new EntityRelEntity() + .setRelation(s._2()._1().getRelation()) + .setTarget(s._2()._2().getSource())) + .map(j -> new ObjectMapper().writeValueAsString(j)) + .saveAsTextFile(bySourcePath, GzipCodec.class); + + JavaPairRDD bySource = jsc.textFile(bySourcePath) + .map(e -> getObjectMapper().readValue(e, 
EntityRelEntity.class)) + .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t)); + + final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId); + entities + .union(bySource) + .groupByKey() // by source id + .map(l -> toJoinedEntity(l)) + .mapToPair(je -> new Tuple2<>( + new Text(je.getEntity().getId()), + new Text(recordFactory.build(je)))) + .saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + + return this; + } + + public SparkSession getSpark() { + return spark; + } + + public String getInputPath() { + return inputPath; + } + + public String getOutPath() { + return outPath; + } + + // HELPERS + + private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) { + try { + switch (GraphMappingUtils.EntityType.valueOf(type)) { + case publication: + return mapper.readValue(json, Publication.class); + case dataset: + return mapper.readValue(json, Dataset.class); + case otherresearchproduct: + return mapper.readValue(json, OtherResearchProduct.class); + case software: + return mapper.readValue(json, Software.class); + case datasource: + return mapper.readValue(json, Datasource.class); + case organization: + return mapper.readValue(json, Organization.class); + case project: + return mapper.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) { + final ObjectMapper mapper = getObjectMapper(); + final JoinedEntity j = new JoinedEntity(); + final Links links = new Links(); + for(EntityRelEntity rel : p._2()) { + if (rel.hasMainEntity() && j.getEntity() == null) { + j.setType(rel.getSource().getType()); + j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper)); + } + if (rel.hasRelatedEntity()) { + try { + links.add( + new eu.dnetlib.dhp.graph.model.Tuple2() + .setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class)) + .setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class))); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + } + j.setLinks(links); + if (j.getEntity() == null) { + throw new IllegalStateException("missing main entity on '" + p._1() + "'"); + } + return j; + } + + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a text file, + * extracts the necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow + * @param sc + * @param inputPath + * @param type + * @return the JavaPairRDD indexed by entity identifier + */ + private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) { + return sc.textFile(inputPath + "/" + type) + .mapToPair((PairFunction<String, String, TypedRow>) s -> { + final DocumentContext json = JsonPath.parse(s); + final String id = json.read("$.id"); + return new Tuple2<>(id, new TypedRow() + .setSourceId(id) + .setDeleted(json.read("$.dataInfo.deletedbyinference")) + .setType(type) + .setOaf(s)); + }); + } + + /** + * Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a text file, + * extracts the necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow + * @param sc + * @param inputPath + * @return the JavaRDD containing all the relationships + */ + private JavaRDD<TypedRow>
readPathRelation(final JavaSparkContext sc, final String inputPath) { + return sc.textFile(inputPath + "/relation") + .map(s -> { + final DocumentContext json = JsonPath.parse(s); + return new TypedRow() + .setSourceId(json.read("$.source")) + .setTargetId(json.read("$.target")) + .setDeleted(json.read("$.dataInfo.deletedbyinference")) + .setType("relation") + .setRelType(json.read("$.relType")) + .setSubRelType(json.read("$.subRelType")) + .setRelClass(json.read("$.relClass")) + .setOaf(s); + }); + } + + private ObjectMapper getObjectMapper() { + return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + } + + private void prepareAccumulators(SparkContext sc) { + accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + + accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + + accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("organizationOrganization_dedup_merges")); + accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); + } + +}
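
Before moving on to the indexing job, the join shape described in the GraphJoiner javadoc above, (((T.id join R.target) union S) groupBy S.id), can be seen in isolation on toy data. This is a hedged sketch using plain JavaPairRDDs; every identifier and payload below is invented.

    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    import java.util.Arrays;

    public class AdjacencySketch {

        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder()
                    .appName("adjacency-sketch")
                    .master("local[*]")
                    .getOrCreate();
            JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

            // S: source entities keyed by id (payloads reduced to bare strings)
            JavaPairRDD<String, String> entities = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("p1", "publication one"),
                    new Tuple2<>("d1", "dataset one")));

            // R: relations keyed by *target* id, value = source id (p1 -> d1)
            JavaPairRDD<String, String> relsByTarget = sc.parallelizePairs(Arrays.asList(
                    new Tuple2<>("d1", "p1")));

            // (T.id join R.target): attach the related entity payload to each
            // relation, then re-key by source id and union with S
            JavaPairRDD<String, String> linked = relsByTarget
                    .join(entities)
                    .mapToPair(t -> new Tuple2<>(t._2()._1(), "rel:" + t._2()._2()));

            // groupBy S.id: source id -> [own payload, rel:..., rel:...]
            entities.union(linked)
                    .groupByKey()
                    .collect()
                    .forEach(System.out::println);

            spark.stop();
        }
    }

Grouping by the source id is what turns the flat relation stream into the per-entity adjacency list that XmlRecordFactory serializes afterwards.

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java new file mode 100644 index 0000000000..63ff8fb312 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlIndexingJob.java @@ -0,0 +1,188 @@ +package eu.dnetlib.dhp.graph; + +import com.lucidworks.spark.util.SolrSupport; +import eu.dnetlib.dhp.application.ArgumentApplicationParser;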
+import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.solr.common.SolrInputDocument; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.SparkSession; + +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamResult; +import javax.xml.transform.stream.StreamSource; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.text.SimpleDateFormat; +import java.util.Date; + +public class SparkXmlIndexingJob { + + private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class); + + private static final Integer DEFAULT_BATCH_SIZE = 1000; + + private static final String LAYOUT = "index"; + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_update_index.json"))); + parser.parseArgument(args); + + final String inputPath = parser.get("sourcePath"); + final String isLookupUrl = parser.get("isLookupUrl"); + final String format = parser.get("format"); + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? 
Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; + + final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + final String fields = getLayoutSource(isLookup, format); + final String xslt = getLayoutTransformer(isLookup); + + final String dsId = getDsId(format, isLookup); + final String zkHost = getZkHost(isLookup); + final String version = getRecordDatestamp(); + + final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); + + log.info("indexRecordTransformer: " + indexRecordXslt); + + final String master = parser.get("master"); + final SparkConf conf = new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + + try(SparkSession spark = getSession(conf, master)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + RDD docs = sc.sequenceFile(inputPath, Text.class, Text.class) + .map(t -> t._2().toString()) + .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) + .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) + .rdd(); + + SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs); + } + } + + private static SparkSession getSession(SparkConf conf, String master) { + return SparkSession + .builder() + .config(conf) + .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) + .master(master) + .getOrCreate(); + } + + private static String toIndexRecord(Transformer tr, final String record) { + final StreamResult res = new StreamResult(new StringWriter()); + try { + tr.transform(new StreamSource(new StringReader(record)), res); + return res.getWriter().toString(); + } catch (Throwable e) { + System.out.println("XPathException on record:\n" + record); + throw new IllegalArgumentException(e); + } + } + + /** + * Creates the XSLT responsible for building the index xml records. 
+ * + * @param format Metadata format name (DMF|TMF) + * @param xslt xslt for building the index record transformer + * @param fields the list of fields + * @return the XSLT source of the index record transformer + * @throws TransformerException could happen + */ + private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException { + + final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); + final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); + + layoutTransformer.setParameter("format", format); + layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); + + return layoutToXsltXslt.getWriter().toString(); + } + + /** + * Method returns a solr-compatible string representation of the current date, used to mark all records as indexed today + * @return the formatted date + */ + public static String getRecordDatestamp() { + return new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").format(new Date()); + } + + /** + * Method retrieves from the information system the list of fields associated with the given MDFormat name + * + * @param isLookup the ISLookup service stub + * @param format the Metadata format name + * @return the string representation of the list of fields to be indexed + * + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException { + return doLookup(isLookup, String.format( + "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT)); + } + + /** + * Method retrieves from the information system the openaireLayoutToRecordStylesheet + * + * @param isLookup the ISLookup service stub + * @return the string representation of the XSLT contained in the transformation rule profile + * + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { + return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); + } + + /** + * Method retrieves from the information system the IndexDS profile ID associated with the given MDFormat name + * @param format + * @param isLookup + * @return the IndexDS identifier + * @throws ISLookUpException + */ + private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { + return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", format)); + } + + /** + * Method retrieves from the information system the zookeeper quorum of the Solr server + * @param isLookup + * @return the zookeeper quorum of the Solr server + * @throws ISLookUpException + */ + private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { + return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); + } + + private static String doLookup(ISLookUpService isLookup,
String xquery) throws ISLookUpException { + log.info(String.format("running xquery: %s", xquery)); + final String res = isLookup.getResourceProfileByQuery(xquery); + log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); + return res; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java new file mode 100644 index 0000000000..5fa3e63850 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/SparkXmlRecordBuilderJob.java @@ -0,0 +1,50 @@ +package eu.dnetlib.dhp.graph; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.graph.utils.ContextMapper; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; + +public class SparkXmlRecordBuilderJob { + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json"))); + parser.parseArgument(args); + + final String master = parser.get("master"); + final SparkConf conf = new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + + try(SparkSession spark = getSession(conf, master)) { + + final String inputPath = parser.get("sourcePath"); + final String outputPath = parser.get("outputPath"); + final String isLookupUrl = parser.get("isLookupUrl"); + final String otherDsTypeId = parser.get("otherDsTypeId"); + + final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + if (fs.exists(new Path(outputPath))) { + fs.delete(new Path(outputPath), true); + fs.mkdirs(new Path(outputPath)); + } + + new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath) + .adjacencyLists(); + //.asXML(); + } + } + + private static SparkSession getSession(SparkConf conf, String master) { + return SparkSession + .builder() + .config(conf) + .appName(SparkXmlRecordBuilderJob.class.getSimpleName()) + .master(master) + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java new file mode 100644 index 0000000000..8c08337e20 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/EntityRelEntity.java @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.graph.model; + +import java.io.Serializable; + +public class EntityRelEntity implements Serializable { + + private TypedRow source; + private TypedRow relation; + private TypedRow target; + + public EntityRelEntity() { + } + + public EntityRelEntity(TypedRow source) { + this.source = source; + } + + //helpers + public Boolean hasMainEntity() { + return getSource() != null && getRelation() == null && getTarget() == null; + } + + public Boolean hasRelatedEntity() { + return getSource() == null && getRelation() != null && getTarget() != null; + } + + + public TypedRow getSource() { + return source; + } + + public EntityRelEntity setSource(TypedRow source) { + this.source = source; + return this; + } + + public TypedRow getRelation() { + return
relation; + } + + public EntityRelEntity setRelation(TypedRow relation) { + this.relation = relation; + return this; + } + + public TypedRow getTarget() { + return target; + } + + public EntityRelEntity setTarget(TypedRow target) { + this.target = target; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java new file mode 100644 index 0000000000..f89273a0d5 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/JoinedEntity.java @@ -0,0 +1,41 @@ +package eu.dnetlib.dhp.graph.model; + +import eu.dnetlib.dhp.schema.oaf.OafEntity; + +import java.io.Serializable; + +public class JoinedEntity implements Serializable { + + private String type; + + private OafEntity entity; + + private Links links; + + public String getType() { + return type; + } + + public JoinedEntity setType(String type) { + this.type = type; + return this; + } + + public OafEntity getEntity() { + return entity; + } + + public JoinedEntity setEntity(OafEntity entity) { + this.entity = entity; + return this; + } + + public Links getLinks() { + return links; + } + + public JoinedEntity setLinks(Links links) { + this.links = links; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java new file mode 100644 index 0000000000..96ad67b0ce --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Links.java @@ -0,0 +1,6 @@ +package eu.dnetlib.dhp.graph.model; + +import java.util.ArrayList; + +public class Links extends ArrayList { +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java new file mode 100644 index 0000000000..baeff1c6a0 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/RelatedEntity.java @@ -0,0 +1,257 @@ +package eu.dnetlib.dhp.graph.model; + +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import org.codehaus.jackson.map.ObjectMapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +public class RelatedEntity implements Serializable { + + private String id; + private String type; + + // common fields + private StructuredProperty title; + private String websiteurl; // datasource, organizations, projects + + // results + private String dateofacceptance; + private String publisher; + private List pid; + private String codeRepositoryUrl; + private Qualifier resulttype; + private List collectedfrom; + private List instances; + + // datasource + private String officialname; + private Qualifier datasourcetype; + private Qualifier datasourcetypeui; + private Qualifier openairecompatibility; + //private String aggregatortype; + + // organization + private String legalname; + private String legalshortname; + private Qualifier country; + + // project + private String projectTitle; + private String code; + private String acronym; + private Qualifier contracttype; + private List fundingtree; + + public String getId() { + return id; + } + + public RelatedEntity setId(String 
id) { + this.id = id; + return this; + } + + public StructuredProperty getTitle() { + return title; + } + + public RelatedEntity setTitle(StructuredProperty title) { + this.title = title; + return this; + } + + public String getDateofacceptance() { + return dateofacceptance; + } + + public RelatedEntity setDateofacceptance(String dateofacceptance) { + this.dateofacceptance = dateofacceptance; + return this; + } + + public String getPublisher() { + return publisher; + } + + public RelatedEntity setPublisher(String publisher) { + this.publisher = publisher; + return this; + } + + public List getPid() { + return pid; + } + + public RelatedEntity setPid(List pid) { + this.pid = pid; + return this; + } + + public String getCodeRepositoryUrl() { + return codeRepositoryUrl; + } + + public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + return this; + } + + public Qualifier getResulttype() { + return resulttype; + } + + public RelatedEntity setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + return this; + } + + public List getCollectedfrom() { + return collectedfrom; + } + + public RelatedEntity setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + return this; + } + + public List getInstances() { + return instances; + } + + public RelatedEntity setInstances(List instances) { + this.instances = instances; + return this; + } + + public String getOfficialname() { + return officialname; + } + + public RelatedEntity setOfficialname(String officialname) { + this.officialname = officialname; + return this; + } + + public String getWebsiteurl() { + return websiteurl; + } + + public RelatedEntity setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + return this; + } + + public Qualifier getDatasourcetype() { + return datasourcetype; + } + + public RelatedEntity setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + return this; + } + + public Qualifier getDatasourcetypeui() { + return datasourcetypeui; + } + + public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) { + this.datasourcetypeui = datasourcetypeui; + return this; + } + + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } + + public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + return this; + } + + public String getLegalname() { + return legalname; + } + + public RelatedEntity setLegalname(String legalname) { + this.legalname = legalname; + return this; + } + + public String getLegalshortname() { + return legalshortname; + } + + public RelatedEntity setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + return this; + } + + public Qualifier getCountry() { + return country; + } + + public RelatedEntity setCountry(Qualifier country) { + this.country = country; + return this; + } + + public String getCode() { + return code; + } + + public RelatedEntity setCode(String code) { + this.code = code; + return this; + } + + public String getAcronym() { + return acronym; + } + + public RelatedEntity setAcronym(String acronym) { + this.acronym = acronym; + return this; + } + + public Qualifier getContracttype() { + return contracttype; + } + + public RelatedEntity setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + return this; + } + + public List getFundingtree() { + return fundingtree; + } + + public RelatedEntity 
setFundingtree(List fundingtree) { + this.fundingtree = fundingtree; + return this; + } + + public String getProjectTitle() { + return projectTitle; + } + + public RelatedEntity setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + return this; + } + + public String getType() { + return type; + } + + public RelatedEntity setType(String type) { + this.type = type; + return this; + } + +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java new file mode 100644 index 0000000000..6bfbab5471 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/SortableRelationKey.java @@ -0,0 +1,99 @@ +package eu.dnetlib.dhp.graph.model; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import java.io.Serializable; +import java.util.Map; + +/** + * Allows sorting relationships according to the priority defined in the weights map. + */ +public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable { + + private String sourceId; + private String targetId; + + private String relType; + private String subRelType; + private String relClass; + + private final static Map<String, Integer> weights = Maps.newHashMap(); + + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("publicationDataset", 2); + weights.put("relationship", 3); + weights.put("similarity", 4); + weights.put("affiliation", 5); + + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } + + public static SortableRelationKey from(final EntityRelEntity e) { + return new SortableRelationKey() + .setSourceId(e.getRelation().getSourceId()) + .setTargetId(e.getRelation().getTargetId()) + .setRelType(e.getRelation().getRelType()) + .setSubRelType(e.getRelation().getSubRelType()) + .setRelClass(e.getRelation().getRelClass()); + } + + public String getSourceId() { + return sourceId; + } + + public SortableRelationKey setSourceId(String sourceId) { + this.sourceId = sourceId; + return this; + } + + public String getTargetId() { + return targetId; + } + + public SortableRelationKey setTargetId(String targetId) { + this.targetId = targetId; + return this; + } + + public String getRelType() { + return relType; + } + + public SortableRelationKey setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubRelType() { + return subRelType; + } + + public SortableRelationKey setSubRelType(String subRelType) { + this.subRelType = subRelType; + return this; + } + + public String getRelClass() { + return relClass; + } + + public SortableRelationKey setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + + @Override + public int compareTo(SortableRelationKey o) { + return ComparisonChain.start() + .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) + .compare(getSourceId(), o.getSourceId()) + .compare(getTargetId(), o.getTargetId()) + .result(); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java new file mode 100644 index 0000000000..ab965808bf --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/Tuple2.java @@ -0,0 +1,28 @@ +package
eu.dnetlib.dhp.graph.model; + +import eu.dnetlib.dhp.schema.oaf.Relation; + +public class Tuple2 { + + private Relation relation; + + private RelatedEntity relatedEntity; + + public Relation getRelation() { + return relation; + } + + public Tuple2 setRelation(Relation relation) { + this.relation = relation; + return this; + } + + public RelatedEntity getRelatedEntity() { + return relatedEntity; + } + + public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) { + this.relatedEntity = relatedEntity; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java new file mode 100644 index 0000000000..8205c38ef7 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/model/TypedRow.java @@ -0,0 +1,92 @@ +package eu.dnetlib.dhp.graph.model; + +import java.io.Serializable; + +public class TypedRow implements Serializable { + + private String sourceId; + + private String targetId; + + private Boolean deleted; + + private String type; + + private String relType; + private String subRelType; + private String relClass; + + private String oaf; + + public String getSourceId() { + return sourceId; + } + + public TypedRow setSourceId(String sourceId) { + this.sourceId = sourceId; + return this; + } + + public String getTargetId() { + return targetId; + } + + public TypedRow setTargetId(String targetId) { + this.targetId = targetId; + return this; + } + + public Boolean getDeleted() { + return deleted; + } + + public TypedRow setDeleted(Boolean deleted) { + this.deleted = deleted; + return this; + } + + public String getType() { + return type; + } + + public TypedRow setType(String type) { + this.type = type; + return this; + } + + public String getRelType() { + return relType; + } + + public TypedRow setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubRelType() { + return subRelType; + } + + public TypedRow setSubRelType(String subRelType) { + this.subRelType = subRelType; + return this; + } + + public String getRelClass() { + return relClass; + } + + public TypedRow setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + + public String getOaf() { + return oaf; + } + + public TypedRow setOaf(String oaf) { + this.oaf = oaf; + return this; + } +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java new file mode 100644 index 0000000000..05d9456f68 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextDef.java @@ -0,0 +1,51 @@ +package eu.dnetlib.dhp.graph.utils; + +import java.io.Serializable; + +public class ContextDef implements Serializable { + + private String id; + private String label; + private String name; + private String type; + + public ContextDef(final String id, final String label, final String name, final String type) { + super(); + this.setId(id); + this.setLabel(label); + this.setName(name); + this.setType(type); + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name 
= name; + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java new file mode 100644 index 0000000000..ad9e7dfadd --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/ContextMapper.java @@ -0,0 +1,46 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.base.Joiner; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; + +import java.io.Serializable; +import java.io.StringReader; +import java.util.HashMap; + +public class ContextMapper extends HashMap implements Serializable { + + private static final long serialVersionUID = 2159682308502487305L; + + private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; + + public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + StringBuilder sb = new StringBuilder(""); + Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); + sb.append(""); + return fromXml(sb.toString()); + } + + public static ContextMapper fromXml(final String xml) throws DocumentException { + final ContextMapper contextMapper = new ContextMapper(); + + final Document doc = new SAXReader().read(new StringReader(xml)); + for (Object o : doc.selectNodes("//entry")) { + Node node = (Node) o; + String id = node.valueOf("./@id"); + String label = node.valueOf("./@label"); + String name = node.valueOf("./@name"); + String type = node.valueOf("./@type") + ""; + + contextMapper.put(id, new ContextDef(id, label, name, type)); + } + return contextMapper; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java new file mode 100644 index 0000000000..3d8cde703e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/GraphMappingUtils.java @@ -0,0 +1,231 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Predicate; +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.graph.model.EntityRelEntity; +import eu.dnetlib.dhp.graph.model.RelatedEntity; +import eu.dnetlib.dhp.graph.model.TypedRow; +import eu.dnetlib.dhp.schema.oaf.*; +import net.minidev.json.JSONArray; +import org.apache.commons.lang3.StringUtils; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static 
org.apache.commons.lang3.StringUtils.*; + +public class GraphMappingUtils { + + public static final String SEPARATOR = "_"; + + public enum EntityType { + publication, dataset, otherresearchproduct, software, datasource, organization, project + } + + public enum MainEntityType { + result, datasource, organization, project + } + + public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); + + public static Set instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation"); + + private static final String schemeTemplate = "dnet:%s_%s_relations"; + + private static Map entityMapping = Maps.newHashMap(); + + static { + entityMapping.put(EntityType.publication, MainEntityType.result); + entityMapping.put(EntityType.dataset, MainEntityType.result); + entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); + entityMapping.put(EntityType.software, MainEntityType.result); + entityMapping.put(EntityType.datasource, MainEntityType.datasource); + entityMapping.put(EntityType.organization, MainEntityType.organization); + entityMapping.put(EntityType.project, MainEntityType.project); + } + + public static String getScheme(final String sourceType, final String targetType) { + return String.format(schemeTemplate, + entityMapping.get(EntityType.valueOf(sourceType)).name(), + entityMapping.get(EntityType.valueOf(targetType)).name()); + } + + public static String getMainType(final String type) { + return entityMapping.get(EntityType.valueOf(type)).name(); + } + + public static boolean isResult(String type) { + return MainEntityType.result.name().equals(getMainType(type)); + } + + public static Predicate instanceFilter = s -> instanceFieldFilter.contains(s); + + public static EntityRelEntity asRelatedEntity(EntityRelEntity e) { + + final DocumentContext j = JsonPath.parse(e.getSource().getOaf()); + final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType()); + + switch (EntityType.valueOf(e.getSource().getType())) { + case publication: + case dataset: + case otherresearchproduct: + case software: + mapTitle(j, re); + re.setDateofacceptance(j.read("$.dateofacceptance.value")); + re.setPublisher(j.read("$.publisher.value")); + + JSONArray pids = j.read("$.pid"); + re.setPid(pids.stream() + .map(p -> asStructuredProperty((LinkedHashMap) p)) + .collect(Collectors.toList())); + + re.setResulttype(asQualifier(j.read("$.resulttype"))); + + JSONArray collfrom = j.read("$.collectedfrom"); + re.setCollectedfrom(collfrom.stream() + .map(c -> asKV((LinkedHashMap) c)) + .collect(Collectors.toList())); + + // will throw exception when the instance is not found + JSONArray instances = j.read("$.instance"); + re.setInstances(instances.stream() + .map(i -> { + final LinkedHashMap p = (LinkedHashMap) i; + final Field license = new Field(); + license.setValue((String) ((LinkedHashMap) p.get("license")).get("value")); + final Instance instance = new Instance(); + instance.setLicense(license); + instance.setAccessright(asQualifier((LinkedHashMap) p.get("accessright"))); + instance.setInstancetype(asQualifier((LinkedHashMap) p.get("instancetype"))); + instance.setHostedby(asKV((LinkedHashMap) p.get("hostedby"))); + //TODO mapping of distributionlocation + instance.setCollectedfrom(asKV((LinkedHashMap) p.get("collectedfrom"))); + + Field dateofacceptance = new Field(); + dateofacceptance.setValue((String) ((LinkedHashMap) p.get("dateofacceptance")).get("value")); + 
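+ // note: these raw LinkedHashMap lookups assume the corresponding JSON nodes are present; a missing 'dateofacceptance' node, like a missing 'license' above, surfaces as a NullPointerException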
instance.setDateofacceptance(dateofacceptance); + return instance; + }).collect(Collectors.toList())); + + //TODO still to be mapped + //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + + break; + case datasource: + re.setOfficialname(j.read("$.officialname.value")); + re.setWebsiteurl(j.read("$.websiteurl.value")); + re.setDatasourcetype(asQualifier(j.read("$.datasourcetype"))); + re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility"))); + + break; + case organization: + re.setLegalname(j.read("$.legalname.value")); + re.setLegalshortname(j.read("$.legalshortname.value")); + re.setCountry(asQualifier(j.read("$.country"))); + re.setWebsiteurl(j.read("$.websiteurl.value")); + break; + case project: + re.setProjectTitle(j.read("$.title.value")); + re.setCode(j.read("$.code.value")); + re.setAcronym(j.read("$.acronym.value")); + re.setContracttype(asQualifier(j.read("$.contracttype"))); + + JSONArray f = j.read("$.fundingtree"); + if (!f.isEmpty()) { + re.setFundingtree(f.stream() + .map(s -> ((LinkedHashMap) s).get("value")) + .collect(Collectors.toList())); + } + + break; + } + return new EntityRelEntity().setSource( + new TypedRow() + .setSourceId(e.getSource().getSourceId()) + .setDeleted(e.getSource().getDeleted()) + .setType(e.getSource().getType()) + .setOaf(serialize(re))); + } + + private static KeyValue asKV(LinkedHashMap j) { + final KeyValue kv = new KeyValue(); + kv.setKey((String) j.get("key")); + kv.setValue((String) j.get("value")); + return kv; + } + + private static void mapTitle(DocumentContext j, RelatedEntity re) { + final JSONArray a = j.read("$.title"); + if (!a.isEmpty()) { + final StructuredProperty sp = asStructuredProperty((LinkedHashMap) a.get(0)); + if (StringUtils.isNotBlank(sp.getValue())) { + re.setTitle(sp); + } + } + } + + private static StructuredProperty asStructuredProperty(LinkedHashMap j) { + final StructuredProperty sp = new StructuredProperty(); + final String value = (String) j.get("value"); + if (StringUtils.isNotBlank(value)) { + sp.setValue((String) j.get("value")); + sp.setQualifier(asQualifier((LinkedHashMap) j.get("qualifier"))); + } + return sp; + } + + public static Qualifier asQualifier(LinkedHashMap j) { + final Qualifier q = new Qualifier(); + + final String classid = j.get("classid"); + if (StringUtils.isNotBlank(classid)) { + q.setClassid(classid); + } + + final String classname = j.get("classname"); + if (StringUtils.isNotBlank(classname)) { + q.setClassname(classname); + } + + final String schemeid = j.get("schemeid"); + if (StringUtils.isNotBlank(schemeid)) { + q.setSchemeid(schemeid); + } + + final String schemename = j.get("schemename"); + if (StringUtils.isNotBlank(schemename)) { + q.setSchemename(schemename); + } + return q; + } + + public static String serialize(final Object o) { + try { + return new ObjectMapper() + .setSerializationInclusion(JsonInclude.Include.NON_NULL) + .writeValueAsString(o); + } catch (JsonProcessingException e) { + throw new IllegalArgumentException("unable to serialize: " + o.toString(), e); + } + } + + public static String removePrefix(final String s) { + if (s.contains("|")) return substringAfter(s, "|"); + return s; + } + + public static String getRelDescriptor(String relType, String subRelType, String relClass) { + return relType + SEPARATOR + subRelType + SEPARATOR + relClass; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java new file mode 100644 index 0000000000..c4cbfadea3 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/LicenseComparator.java @@ -0,0 +1,49 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +import java.util.Comparator; + +public class LicenseComparator implements Comparator<Qualifier> { + + @Override + public int compare(Qualifier left, Qualifier right) { + + if (left == null && right == null) return 0; + if (left == null) return 1; + if (right == null) return -1; + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) return 0; + + if (lClass.equals("OPEN SOURCE")) return -1; + if (rClass.equals("OPEN SOURCE")) return 1; + + if (lClass.equals("OPEN")) return -1; + if (rClass.equals("OPEN")) return 1; + + if (lClass.equals("6MONTHS")) return -1; + if (rClass.equals("6MONTHS")) return 1; + + if (lClass.equals("12MONTHS")) return -1; + if (rClass.equals("12MONTHS")) return 1; + + if (lClass.equals("EMBARGO")) return -1; + if (rClass.equals("EMBARGO")) return 1; + + if (lClass.equals("RESTRICTED")) return -1; + if (rClass.equals("RESTRICTED")) return 1; + + if (lClass.equals("CLOSED")) return -1; + if (rClass.equals("CLOSED")) return 1; + + if (lClass.equals("UNKNOWN")) return -1; + if (rClass.equals("UNKNOWN")) return 1; + + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java new file mode 100644 index 0000000000..f4b1514d0e --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/RelationPartitioner.java @@ -0,0 +1,29 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.graph.model.SortableRelationKey; +import org.apache.spark.Partitioner; +import org.apache.spark.util.Utils; + +/** + * Used in combination with SortableRelationKey, partitions the records by source id, so that + * relations sharing the same source id can be sorted according to the ordering defined in SortableRelationKey. 
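+ * <p>
+ * A sketch of the intended use from the Spark side (assumed here for illustration, not part of this patch):
+ * <pre>
+ * JavaPairRDD&lt;SortableRelationKey, EntityRelEntity&gt; rels = ...; // keyed via SortableRelationKey.from(..)
+ * rels.repartitionAndSortWithinPartitions(new RelationPartitioner(rels.getNumPartitions()));
+ * </pre>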
+ */ +public class RelationPartitioner extends Partitioner { + + private int numPartitions; + + public RelationPartitioner(int numPartitions) { + this.numPartitions = numPartitions; + } + + @Override + public int numPartitions() { + return numPartitions; + } + + @Override + public int getPartition(Object key) { + return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions()); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java new file mode 100644 index 0000000000..736c9fc287 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/StreamingInputDocumentFactory.java @@ -0,0 +1,253 @@ +package eu.dnetlib.dhp.graph.utils; + +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import javax.xml.stream.*; +import javax.xml.stream.events.Namespace; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import com.google.common.collect.Lists; +import org.apache.solr.common.SolrInputDocument; + +/** + * Optimized version of the document parser, drop-in replacement of InputDocumentFactory. + * + * <p>
+ * Faster because:
+ * <ul>
+ *   <li>Doesn't create a DOM for the full document</li>
+ *   <li>Doesn't execute XPaths against the DOM</li>
+ *   <li>Quickly serializes the 'result' element directly into a string.</li>
+ *   <li>Uses less memory: less pressure on the GC, and allows more threads to process this in parallel</li>
+ * </ul>
+ * <p>
+ * This class is fully reentrant and can be invoked in parallel.
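+ * <p>
+ * Typical use, sketched (the version and datasource id values are illustrative only):
+ * <pre>
+ * StreamingInputDocumentFactory factory = new StreamingInputDocumentFactory("2020-04-01T00:00:00Z", "ds-id");
+ * SolrInputDocument doc = factory.parseDocument(indexRecordXml);
+ * </pre>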
+ * + * @author claudio + * + */ +public class StreamingInputDocumentFactory { + + private static final String INDEX_FIELD_PREFIX = "__"; + + private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; + + private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; + + private static final String RESULT = "result"; + + private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; + + private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; + + private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + + private final static List dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); + + private static final String DEFAULTDNETRESULT = "dnetResult"; + + private static final String TARGETFIELDS = "targetFields"; + + private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; + + private static final String ROOT_ELEMENT = "indexRecord"; + + private static final int MAX_FIELD_LENGTH = 25000; + + private ThreadLocal inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); + + private ThreadLocal outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); + + private ThreadLocal eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); + + private String version; + + private String dsId; + + private String resultName = DEFAULTDNETRESULT; + + public StreamingInputDocumentFactory(final String version, final String dsId) { + this(version, dsId, DEFAULTDNETRESULT); + } + + public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) { + this.version = version; + this.dsId = dsId; + this.resultName = resultName; + } + + public SolrInputDocument parseDocument(final String inputDocument) { + + final StringWriter results = new StringWriter(); + final List nsList = Lists.newLinkedList(); + try { + + XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); + + final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); + + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if ((event != null) && event.isStartElement()) { + final String localName = event.asStartElement().getName().getLocalPart(); + + if (ROOT_ELEMENT.equals(localName)) { + nsList.addAll(getNamespaces(event)); + } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { + final XMLEvent text = parser.nextEvent(); + String recordId = getText(text); + indexDocument.addField(INDEX_RECORD_ID, recordId); + } else if (TARGETFIELDS.equals(localName)) { + parseTargetFields(indexDocument, parser); + } else if (resultName.equals(localName)) { + copyResult(indexDocument, results, parser, nsList, resultName); + } + } + } + + if (version != null) { + indexDocument.addField(DS_VERSION, version); + } + + if (dsId != null) { + indexDocument.addField(DS_ID, dsId); + } + + if (!indexDocument.containsKey(INDEX_RECORD_ID)) { + indexDocument.clear(); + System.err.println("missing indexrecord id:\n" + inputDocument); + } + + return indexDocument; + } catch (XMLStreamException e) { + return new SolrInputDocument(); + } + } + + private List getNamespaces(final XMLEvent event) { + final List res = Lists.newLinkedList(); + @SuppressWarnings("unchecked") + Iterator nsIter = event.asStartElement().getNamespaces(); + while (nsIter.hasNext()) { + Namespace ns = nsIter.next(); + res.add(ns); + } + return res; + } + + /** + * Parse the 
targetFields block and add fields to the Solr document. + * + * @param indexDocument + * @param parser + * @throws XMLStreamException + */ + protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException { + + boolean hasFields = false; + + while (parser.hasNext()) { + final XMLEvent targetEvent = parser.nextEvent(); + if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { + break; + } + + if (targetEvent.isStartElement()) { + final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); + final XMLEvent text = parser.nextEvent(); + + String data = getText(text); + + addField(indexDocument, fieldName, data); + hasFields = true; + } + } + + if (!hasFields) { + indexDocument.clear(); + } + } + + /** + * Copy the /indexRecord/result element and children, preserving namespace declarations etc. + * + * @param indexDocument + * @param results + * @param parser + * @param nsList + * @param dnetResult + * @throws XMLStreamException + */ + protected void copyResult(final SolrInputDocument indexDocument, + final StringWriter results, + final XMLEventReader parser, + final List<Namespace> nsList, + final String dnetResult) throws XMLStreamException { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); + + for (Namespace ns : nsList) { + eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); + } + + StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); + + // new root record + writer.add(newRecord); + + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent resultEvent = parser.nextEvent(); + + // TODO: replace with depth tracking instead of close tag tracking. + if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { + writer.add(eventFactory.get().createEndElement("", null, RESULT)); + break; + } + + writer.add(resultEvent); + } + writer.close(); + indexDocument.addField(INDEX_RESULT, results.toString()); + } + + /** + * Helper used to add a field to a Solr document. It avoids adding empty fields + * + * @param indexDocument + * @param field + * @param value + */ + private final void addField(final SolrInputDocument indexDocument, final String field, final String value) { + String cleaned = value.trim(); + if (!cleaned.isEmpty()) { + // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); + indexDocument.addField(field.toLowerCase(), cleaned); + } + } + + /** + * Helper used to get the string from a text element. 
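+ * Character data longer than MAX_FIELD_LENGTH is truncated.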
+ * + * @param text + * @return the + */ + protected final String getText(final XMLEvent text) { + if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart()); + return ""; + + final String data = text.asCharacters().getData(); + if (data != null && data.length() > MAX_FIELD_LENGTH) { + return data.substring(0, MAX_FIELD_LENGTH); + } + + return data; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java new file mode 100644 index 0000000000..27c55fab7f --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateFactory.java @@ -0,0 +1,107 @@ +package eu.dnetlib.dhp.graph.utils; + +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import org.apache.commons.lang3.StringUtils; +import org.stringtemplate.v4.ST; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix; +import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.escapeXml; + +public class TemplateFactory { + + private TemplateResources resources; + + private final static char DELIMITER = '$'; + + public TemplateFactory() { + try { + resources = new TemplateResources(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + + public String buildBody(final String type, final List metadata, final List rels, final List children, final List extraInfo) { + ST body = getTemplate(resources.getEntity()); + + body.add("name", type); + body.add("metadata", metadata); + body.add("rels", rels); + body.add("children", children); + body.add("extrainfo", extraInfo); + + return body.render(); + } + + public String getChild(final String name, final String id, final List metadata) { + return getTemplate(resources.getChild()) + .add("name", name) + .add("hasId", !(id == null)) + .add("id", id != null ? escapeXml(removePrefix(id)) : "") + .add("metadata", metadata) + .render(); + } + + public String buildRecord( + final OafEntity entity, + final String schemaLocation, + final String body) { + return getTemplate(resources.getRecord()) + .add("id", escapeXml(removePrefix(entity.getId()))) + .add("dateofcollection", entity.getDateofcollection()) + .add("dateoftransformation", entity.getDateoftransformation()) + .add("schemaLocation", schemaLocation) + .add("it", body) + .render(); + } + + public String getRel(final String type, + final String objIdentifier, + final Collection fields, + final String semanticclass, + final String semantischeme, + final DataInfo info) { + return getTemplate(resources.getRel()) + .add("type", type) + .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) + .add("class", semanticclass) + .add("scheme", semantischeme) + .add("metadata", fields) + .add("inferred", info.getInferred()) + .add("trust", info.getTrust()) + .add("inferenceprovenance", info.getInferenceprovenance()) + .add("provenanceaction", info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "") + .render(); + } + + public String getInstance(final String resultId, final List instancemetadata, final List webresources) { + return getTemplate(resources.getInstance()) + .add("instanceId", escapeXml(removePrefix(resultId))) + .add("metadata", instancemetadata) + .add("webresources", webresources + .stream() + .filter(StringUtils::isNotBlank) + .map(w -> getWebResource(w)) + .collect(Collectors.toList())) + .render(); + } + + private String getWebResource(final String identifier) { + return getTemplate(resources.getWebresource()) + .add("identifier", escapeXml(identifier)) + .render(); + } + + // HELPERS + + private ST getTemplate(final String res) { + return new ST(res, DELIMITER, DELIMITER); + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java new file mode 100644 index 0000000000..92aaedfd3c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/TemplateResources.java @@ -0,0 +1,54 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.io.Resources; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +public class TemplateResources { + + private String record = read("eu/dnetlib/dhp/graph/template/record.st"); + + private String instance = read("eu/dnetlib/dhp/graph/template/instance.st"); + + private String rel = read("eu/dnetlib/dhp/graph/template/rel.st"); + + private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st"); + + private String child = read("eu/dnetlib/dhp/graph/template/child.st"); + + private String entity = read("eu/dnetlib/dhp/graph/template/entity.st"); + + private static String read(final String classpathResource) throws IOException { + return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); + } + + public TemplateResources() throws IOException { + + } + + public String getEntity() { + return entity; + } + + public String getRecord() { + return record; + } + + public String getInstance() { + return instance; + } + + public String getRel() { + return rel; + } + + public String getWebresource() { + return webresource; + } + + public String getChild() { + return child; + } + +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java new file mode 100644 index 0000000000..74e36a818c --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlRecordFactory.java @@ -0,0 +1,990 @@ +package eu.dnetlib.dhp.graph.utils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.mycila.xmltool.XMLDoc; +import com.mycila.xmltool.XMLTag; +import eu.dnetlib.dhp.graph.model.JoinedEntity; +import eu.dnetlib.dhp.graph.model.RelatedEntity; +import eu.dnetlib.dhp.graph.model.Tuple2; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.Node; +import org.dom4j.io.OutputFormat; 
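+// the OutputFormat imported above and the SAXReader/XMLWriter imported below are used by printXML(..) to re-parse the rendered record and serialize it either pretty-printed or in compact form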
+import org.dom4j.io.SAXReader; +import org.dom4j.io.XMLWriter; + +import javax.xml.transform.*; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.IOException; +import java.io.Serializable; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.*; +import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.*; +import static org.apache.commons.lang3.StringUtils.isNotBlank; +import static org.apache.commons.lang3.StringUtils.substringBefore; + +public class XmlRecordFactory implements Serializable { + + private Map accumulators; + + private Set specialDatasourceTypes; + + private ContextMapper contextMapper; + + private String schemaLocation; + + private boolean indent = false; + + public XmlRecordFactory( + final ContextMapper contextMapper, final boolean indent, + final String schemaLocation, final String otherDatasourceTypesUForUI) { + + this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); + } + + public XmlRecordFactory( + final Map accumulators, + final ContextMapper contextMapper, final boolean indent, + final String schemaLocation, final String otherDatasourceTypesUForUI) { + + this.accumulators = accumulators; + this.contextMapper = contextMapper; + this.schemaLocation = schemaLocation; + this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); + + this.indent = indent; + } + + public String build(final JoinedEntity je) { + + final Set contexts = Sets.newHashSet(); + + final OafEntity entity = je.getEntity(); + TemplateFactory templateFactory = new TemplateFactory(); + try { + final List metadata = metadata(je.getType(), entity, contexts); + + // rels has to be processed before the contexts because they enrich the contextMap with the funding info. + final List relations = listRelations(je, templateFactory, contexts); + + metadata.addAll(buildContexts(getMainType(je.getType()), contexts)); + metadata.add(parseDataInfo(entity.getDataInfo())); + + final String body = templateFactory.buildBody( + getMainType(je.getType()), + metadata, + relations, + listChildren(je, templateFactory), listExtraInfo(je)); + + return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); + } catch (final Throwable e) { + throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); + } + } + + private String printXML(String xml, boolean indent) { + try { + final Document doc = new SAXReader().read(new StringReader(xml)); + OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); + format.setExpandEmptyElements(false); + format.setSuppressDeclaration(true); + StringWriter sw = new StringWriter(); + XMLWriter writer = new XMLWriter(sw, format); + writer.write(doc); + return sw.toString(); + } catch (IOException | DocumentException e) { + throw new IllegalArgumentException("Unable to indent XML. 
Invalid record:\n" + xml, e); + } + } + + private List metadata(final String type, final OafEntity entity, final Set contexts) { + + final List metadata = Lists.newArrayList(); + + + if (entity.getCollectedfrom() != null) { + metadata.addAll(entity.getCollectedfrom() + .stream() + .map(kv -> mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (entity.getOriginalId() != null) { + metadata.addAll(entity.getOriginalId() + .stream() + .map(s -> asXmlElement("originalId", s)) + .collect(Collectors.toList())); + } + if (entity.getPid() != null) { + metadata.addAll(entity.getPid() + .stream() + .map(p -> mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + + if (GraphMappingUtils.isResult(type)) { + final Result r = (Result) entity; + + if (r.getContext() != null) { + contexts.addAll(r.getContext() + .stream() + .map(c -> c.getId()) + .collect(Collectors.toList())); + /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ + if (contexts.contains("dh-ch::subcommunity::2")) { + contexts.add("clarin"); + } + } + + if (r.getTitle() != null) { + metadata.addAll(r.getTitle() + .stream() + .map(t -> mapStructuredProperty("title", t)) + .collect(Collectors.toList())); + } + if (r.getBestaccessright() != null) { + metadata.add(mapQualifier("bestaccessright", r.getBestaccessright())); + } + if (r.getAuthor() != null) { + metadata.addAll(r.getAuthor() + .stream() + .map(a -> { + final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue())) + .forEach(sp -> { + String pidType = escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", ""); + String pidValue = escapeXml(sp.getValue()); + + // ugly hack: some records provide swapped pidtype and pidvalue + if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { + sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); + } else { + pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); + if (isNotBlank(pidType)) { + sb.append(String.format(" %s=\"%s\"", + pidType, + pidValue.toLowerCase().replaceAll("orcid", ""))); + } + } + }); + } + sb.append(">" + escapeXml(a.getFullname()) + ""); + return sb.toString(); + }).collect(Collectors.toList())); + } + if (r.getContributor() != null) { + metadata.addAll(r.getContributor() + .stream() + .map(c -> asXmlElement("contributor", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getCountry() != null) { + metadata.addAll(r.getCountry() + .stream() + .map(c -> mapQualifier("country", c)) + .collect(Collectors.toList())); + } + if (r.getCoverage() != null) { + metadata.addAll(r.getCoverage() + .stream() + .map(c -> asXmlElement("coverage", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getDateofacceptance() != null) { + metadata.add(asXmlElement("dateofacceptance", r.getDateofacceptance().getValue())); + } + if (r.getDescription() != null) { + metadata.addAll(r.getDescription() + .stream() + .map(c -> asXmlElement("description", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getEmbargoenddate() != null) { + metadata.add(asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); + } + if (r.getSubject() != null) { + metadata.addAll(r.getSubject() + .stream() + .map(s -> mapStructuredProperty("subject", s)) + .collect(Collectors.toList())); + } + if (r.getLanguage() != null) { + metadata.add(mapQualifier("language", r.getLanguage())); + } + if (r.getRelevantdate() != null) { + metadata.addAll(r.getRelevantdate() + .stream() + .map(s -> 
mapStructuredProperty("relevantdate", s)) + .collect(Collectors.toList())); + } + if (r.getPublisher() != null) { + metadata.add(asXmlElement("publisher", r.getPublisher().getValue())); + } + if (r.getSource() != null) { + metadata.addAll(r.getSource() + .stream() + .map(c -> asXmlElement("source", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getFormat() != null) { + metadata.addAll(r.getFormat() + .stream() + .map(c -> asXmlElement("format", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getResulttype() != null) { + metadata.add(mapQualifier("resulttype", r.getResulttype())); + } + if (r.getResourcetype() != null) { + metadata.add(mapQualifier("resourcetype", r.getResourcetype())); + } + + metadata.add(mapQualifier("bestaccessright", getBestAccessright(r))); + } + + switch (EntityType.valueOf(type)) { + case publication: + final Publication pub = (Publication) entity; + + if (pub.getJournal() != null) { + final Journal j = pub.getJournal(); + metadata.add(mapJournal(j)); + } + + break; + case dataset: + final Dataset d = (Dataset) entity; + if (d.getDevice() != null) { + metadata.add(asXmlElement("device", d.getDevice().getValue())); + } + if (d.getLastmetadataupdate() != null) { + metadata.add(asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue())); + } + if (d.getMetadataversionnumber() != null) { + metadata.add(asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue())); + } + if (d.getSize() != null) { + metadata.add(asXmlElement("size", d.getSize().getValue())); + } + if (d.getStoragedate() != null) { + metadata.add(asXmlElement("storagedate", d.getStoragedate().getValue())); + } + if (d.getVersion() != null) { + metadata.add(asXmlElement("version", d.getVersion().getValue())); + } + //TODO d.getGeolocation() + + break; + case otherresearchproduct: + final OtherResearchProduct orp = (OtherResearchProduct) entity; + + if (orp.getContactperson() != null) { + metadata.addAll(orp.getContactperson() + .stream() + .map(c -> asXmlElement("contactperson", c.getValue())) + .collect(Collectors.toList())); + } + + if (orp.getContactgroup() != null) { + metadata.addAll(orp.getContactgroup() + .stream() + .map(c -> asXmlElement("contactgroup", c.getValue())) + .collect(Collectors.toList())); + } + if (orp.getTool() != null) { + metadata.addAll(orp.getTool() + .stream() + .map(c -> asXmlElement("tool", c.getValue())) + .collect(Collectors.toList())); + } + break; + case software: + final Software s = (Software) entity; + + if (s.getDocumentationUrl() != null) { + metadata.addAll(s.getDocumentationUrl() + .stream() + .map(c -> asXmlElement("documentationUrl", c.getValue())) + .collect(Collectors.toList())); + } + if (s.getLicense() != null) { + metadata.addAll(s.getLicense() + .stream() + .map(l -> mapStructuredProperty("license", l)) + .collect(Collectors.toList())); + } + if (s.getCodeRepositoryUrl() != null) { + metadata.add(asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); + } + if (s.getProgrammingLanguage() != null) { + metadata.add(mapQualifier("programmingLanguage", s.getProgrammingLanguage())); + } + break; + case datasource: + final Datasource ds = (Datasource) entity; + + if (ds.getDatasourcetype() != null) { + mapDatasourceType(metadata, ds.getDatasourcetype()); + } + if (ds.getOpenairecompatibility() != null) { + metadata.add(mapQualifier("openairecompatibility", ds.getOpenairecompatibility())); + } + if (ds.getOfficialname() != null) { + metadata.add(asXmlElement("officialname", 
ds.getOfficialname().getValue())); + } + if (ds.getEnglishname() != null) { + metadata.add(asXmlElement("englishname", ds.getEnglishname().getValue())); + } + if (ds.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); + } + if (ds.getLogourl() != null) { + metadata.add(asXmlElement("logourl", ds.getLogourl().getValue())); + } + if (ds.getContactemail() != null) { + metadata.add(asXmlElement("contactemail", ds.getContactemail().getValue())); + } + if (ds.getNamespaceprefix() != null) { + metadata.add(asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue())); + } + if (ds.getLatitude() != null) { + metadata.add(asXmlElement("latitude", ds.getLatitude().getValue())); + } + if (ds.getLongitude() != null) { + metadata.add(asXmlElement("longitude", ds.getLongitude().getValue())); + } + if (ds.getDateofvalidation() != null) { + metadata.add(asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue())); + } + if (ds.getDescription() != null) { + metadata.add(asXmlElement("description", ds.getDescription().getValue())); + } + if (ds.getOdnumberofitems() != null) { + metadata.add(asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue())); + } + if (ds.getOdnumberofitemsdate() != null) { + metadata.add(asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); + } + if (ds.getOdpolicies() != null) { + metadata.add(asXmlElement("odpolicies", ds.getOdpolicies().getValue())); + } + if (ds.getOdlanguages() != null) { + metadata.addAll(ds.getOdlanguages() + .stream() + .map(c -> asXmlElement("odlanguages", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getOdcontenttypes() != null) { + metadata.addAll(ds.getOdcontenttypes() + .stream() + .map(c -> asXmlElement("odcontenttypes", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getAccessinfopackage() != null) { + metadata.addAll(ds.getAccessinfopackage() + .stream() + .map(c -> asXmlElement("accessinfopackage", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getReleaseenddate() != null) { + metadata.add(asXmlElement("releasestartdate", ds.getReleaseenddate().getValue())); + } + if (ds.getReleaseenddate() != null) { + metadata.add(asXmlElement("releaseenddate", ds.getReleaseenddate().getValue())); + } + if (ds.getMissionstatementurl() != null) { + metadata.add(asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue())); + } + if (ds.getDataprovider() != null) { + metadata.add(asXmlElement("dataprovider", ds.getDataprovider().getValue().toString())); + } + if (ds.getServiceprovider() != null) { + metadata.add(asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString())); + } + if (ds.getDatabaseaccesstype() != null) { + metadata.add(asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue())); + } + if (ds.getDatauploadtype() != null) { + metadata.add(asXmlElement("datauploadtype", ds.getDatauploadtype().getValue())); + } + if (ds.getDatabaseaccessrestriction() != null) { + metadata.add(asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); + } + if (ds.getDatauploadrestriction() != null) { + metadata.add(asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue())); + } + if (ds.getVersioning() != null) { + metadata.add(asXmlElement("versioning", ds.getVersioning().getValue().toString())); + } + if (ds.getCitationguidelineurl() != null) { + metadata.add(asXmlElement("citationguidelineurl", 
ds.getCitationguidelineurl().getValue())); + } + if (ds.getQualitymanagementkind() != null) { + metadata.add(asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue())); + } + if (ds.getPidsystems() != null) { + metadata.add(asXmlElement("pidsystems", ds.getPidsystems().getValue())); + } + if (ds.getCertificates() != null) { + metadata.add(asXmlElement("certificates", ds.getCertificates().getValue())); + } + if (ds.getPolicies() != null) { + metadata.addAll(ds.getPolicies() + .stream() + .map(kv -> mapKeyValue("policies", kv)) + .collect(Collectors.toList())); + } + if (ds.getJournal() != null) { + metadata.add(mapJournal(ds.getJournal())); + } + if (ds.getSubjects() != null) { + metadata.addAll(ds.getSubjects() + .stream() + .map(sp -> mapStructuredProperty("subjects", sp)) + .collect(Collectors.toList())); + } + + break; + case organization: + final Organization o = (Organization) entity; + + if (o.getLegalshortname() != null) { + metadata.add(asXmlElement("legalshortname", o.getLegalshortname().getValue())); + } + if (o.getLegalname() != null) { + metadata.add(asXmlElement("legalname", o.getLegalname().getValue())); + } + if (o.getAlternativeNames() != null) { + metadata.addAll(o.getAlternativeNames() + .stream() + .map(c -> asXmlElement("alternativeNames", c.getValue())) + .collect(Collectors.toList())); + } + if (o.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", o.getWebsiteurl().getValue())); + } + if (o.getLogourl() != null) { + metadata.add(asXmlElement("websiteurl", o.getLogourl().getValue())); + } + + if (o.getEclegalbody() != null) { + metadata.add(asXmlElement("eclegalbody", o.getEclegalbody().getValue())); + } + if (o.getEclegalperson() != null) { + metadata.add(asXmlElement("eclegalperson", o.getEclegalperson().getValue())); + } + if (o.getEcnonprofit() != null) { + metadata.add(asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); + } + if (o.getEcresearchorganization() != null) { + metadata.add(asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue())); + } + if (o.getEchighereducation() != null) { + metadata.add(asXmlElement("echighereducation", o.getEchighereducation().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata.add(asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganization().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata.add(asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue())); + } + if (o.getEcenterprise() != null) { + metadata.add(asXmlElement("ecenterprise", o.getEcenterprise().getValue())); + } + if (o.getEcsmevalidated() != null) { + metadata.add(asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue())); + } + if (o.getEcnutscode() != null) { + metadata.add(asXmlElement("ecnutscode", o.getEcnutscode().getValue())); + } + if (o.getCountry() != null) { + metadata.add(mapQualifier("country", o.getCountry())); + } + + break; + case project: + + final Project p = (Project) entity; + + if (p.getWebsiteurl() != null) { + metadata.add(asXmlElement("websiteurl", p.getWebsiteurl().getValue())); + } + if (p.getCode() != null) { + metadata.add(asXmlElement("code", p.getCode().getValue())); + } + if (p.getAcronym() != null) { + metadata.add(asXmlElement("acronym", p.getAcronym().getValue())); + } + if (p.getTitle() != null) { + metadata.add(asXmlElement("title", p.getTitle().getValue())); + } + if (p.getStartdate() != null) { + 
metadata.add(asXmlElement("startdate", p.getStartdate().getValue())); + } + if (p.getEnddate() != null) { + metadata.add(asXmlElement("enddate", p.getEnddate().getValue())); + } + if (p.getCallidentifier() != null) { + metadata.add(asXmlElement("callidentifier", p.getCallidentifier().getValue())); + } + if (p.getKeywords() != null) { + metadata.add(asXmlElement("keywords", p.getKeywords().getValue())); + } + if (p.getDuration() != null) { + metadata.add(asXmlElement("duration", p.getDuration().getValue())); + } + if (p.getEcarticle29_3() != null) { + metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); + } + if (p.getSubjects() != null) { + metadata.addAll(p.getSubjects() + .stream() + .map(sp -> mapStructuredProperty("subject", sp)) + .collect(Collectors.toList())); + } + if (p.getContracttype() != null) { + metadata.add(mapQualifier("contracttype", p.getContracttype())); + } + if (p.getEcsc39() != null) { + metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue())); + } + if (p.getContactfullname() != null) { + metadata.add(asXmlElement("contactfullname", p.getContactfullname().getValue())); + } + if (p.getContactfax() != null) { + metadata.add(asXmlElement("contactfax", p.getContactfax().getValue())); + } + if (p.getContactphone() != null) { + metadata.add(asXmlElement("contactphone", p.getContactphone().getValue())); + } + if (p.getContactemail() != null) { + metadata.add(asXmlElement("contactemail", p.getContactemail().getValue())); + } + if (p.getSummary() != null) { + metadata.add(asXmlElement("summary", p.getSummary().getValue())); + } + if (p.getCurrency() != null) { + metadata.add(asXmlElement("currency", p.getCurrency().getValue())); + } + if (p.getTotalcost() != null) { + metadata.add(asXmlElement("totalcost", p.getTotalcost().toString())); + } + if (p.getFundedamount() != null) { + metadata.add(asXmlElement("fundedamount", p.getFundedamount().toString())); + } + if (p.getFundingtree() != null) { + metadata.addAll(p.getFundingtree() + .stream() + .map(ft -> ft.getValue()) + .collect(Collectors.toList())); + } + + break; + default: + throw new IllegalArgumentException("invalid entity type: " + type); + } + + return metadata; + } + + private void mapDatasourceType(List metadata, final Qualifier dsType) { + metadata.add(mapQualifier("datasourcetype", dsType)); + + if (specialDatasourceTypes.contains(dsType.getClassid())) { + dsType.setClassid("other"); + dsType.setClassname("other"); + } + metadata.add(mapQualifier("datasourcetypeui", dsType)); + } + + private Qualifier getBestAccessright(final Result r) { + Qualifier bestAccessRight = new Qualifier(); + bestAccessRight.setClassid("UNKNOWN"); + bestAccessRight.setClassname("not available"); + bestAccessRight.setSchemeid("dnet:access_modes"); + bestAccessRight.setSchemename("dnet:access_modes"); + + final LicenseComparator lc = new LicenseComparator(); + for (final Instance instance : r.getInstance()) { + if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) { + bestAccessRight = instance.getAccessright(); + } + } + return bestAccessRight; + } + + private List listRelations(final JoinedEntity je, TemplateFactory templateFactory, final Set contexts) { + final List rels = Lists.newArrayList(); + + for (final Tuple2 link : je.getLinks()) { + + final Relation rel = link.getRelation(); + final RelatedEntity re = link.getRelatedEntity(); + final String targetType = link.getRelatedEntity().getType(); + + final List metadata = Lists.newArrayList(); + switch (EntityType.valueOf(targetType)) { + case 
publication: + case dataset: + case otherresearchproduct: + case software: + if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { + metadata.add(mapStructuredProperty("title", re.getTitle())); + } + if (isNotBlank(re.getDateofacceptance())) { + metadata.add(asXmlElement("dateofacceptance", re.getDateofacceptance())); + } + if (isNotBlank(re.getPublisher())) { + metadata.add(asXmlElement("publisher", re.getPublisher())); + } + if (isNotBlank(re.getCodeRepositoryUrl())) { + metadata.add(asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); + } + if (re.getResulttype() != null & !re.getResulttype().isBlank()) { + metadata.add(mapQualifier("resulttype", re.getResulttype())); + } + if (re.getCollectedfrom() != null) { + metadata.addAll(re.getCollectedfrom() + .stream() + .map(kv -> mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (re.getPid() != null) { + metadata.addAll(re.getPid() + .stream() + .map(p -> mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + break; + case datasource: + if (isNotBlank(re.getOfficialname())) { + metadata.add(asXmlElement("officialname", re.getOfficialname())); + } + if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { + mapDatasourceType(metadata, re.getDatasourcetype()); + } + if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { + metadata.add(mapQualifier("openairecompatibility", re.getOpenairecompatibility())); + } + break; + case organization: + if (isNotBlank(re.getLegalname())) { + metadata.add(asXmlElement("legalname", re.getLegalname())); + } + if (isNotBlank(re.getLegalshortname())) { + metadata.add(asXmlElement("legalshortname", re.getLegalshortname())); + } + if (re.getCountry() != null & !re.getCountry().isBlank()) { + metadata.add(mapQualifier("country", re.getCountry())); + } + break; + case project: + if (isNotBlank(re.getProjectTitle())) { + metadata.add(asXmlElement("title", re.getProjectTitle())); + } + if (isNotBlank(re.getCode())) { + metadata.add(asXmlElement("code", re.getCode())); + } + if (isNotBlank(re.getAcronym())) { + metadata.add(asXmlElement("acronym", re.getAcronym())); + } + if (re.getContracttype() != null & !re.getContracttype().isBlank()) { + metadata.add(mapQualifier("contracttype", re.getContracttype())); + } + if (re.getFundingtree() != null) { + metadata.addAll(re.getFundingtree() + .stream() + .peek(ft -> fillContextMap(ft, contexts)) + .map(ft -> getRelFundingTree(ft)) + .collect(Collectors.toList())); + } + break; + default: + throw new IllegalArgumentException("invalid target type: " + targetType); + + } + final DataInfo info = rel.getDataInfo(); + final String scheme = getScheme(re.getType(), targetType); + + if (StringUtils.isBlank(scheme)) { + throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); + } + + final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(1); + } + + rels.add(templateFactory.getRel( + targetType, + rel.getTarget(), + Sets.newHashSet(metadata), + rel.getRelClass(), + scheme, + info)); + } + return rels; + } + + private List listChildren(final JoinedEntity je, TemplateFactory templateFactory) { + + final List children = Lists.newArrayList(); + + if (MainEntityType.result.toString().equals(getMainType(je.getType()))) { + final List instances = ((Result) 
je.getEntity()).getInstance(); + if (instances != null) { + for (final Instance instance : ((Result) je.getEntity()).getInstance()) { + + final List fields = Lists.newArrayList(); + + if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { + fields.add(mapQualifier("accessright", instance.getAccessright())); + } + if (instance.getCollectedfrom() != null) { + fields.add(mapKeyValue("collectedfrom", instance.getCollectedfrom())); + } + if (instance.getHostedby() != null) { + fields.add(mapKeyValue("hostedby", instance.getHostedby())); + } + if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) { + fields.add(asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue())); + } + if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { + fields.add(mapQualifier("instancetype", instance.getInstancetype())); + } + if (isNotBlank(instance.getDistributionlocation())) { + fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation())); + } + if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { + fields.add(asXmlElement("refereed", instance.getRefereed().getValue())); + } + if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) { + fields.add(asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue())); + } + if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) { + fields.add(asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue())); + } + + children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl())); + } + } + final List ext = ((Result) je.getEntity()).getExternalReference(); + if (ext != null) { + for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) { + + final List fields = Lists.newArrayList(); + + if (isNotBlank(er.getSitename())) { + fields.add(asXmlElement("sitename", er.getSitename())); + } + if (isNotBlank(er.getLabel())) { + fields.add(asXmlElement("label", er.getLabel())); + } + if (isNotBlank(er.getUrl())) { + fields.add(asXmlElement("url", er.getUrl())); + } + if (isNotBlank(er.getDescription())) { + fields.add(asXmlElement("description", er.getDescription())); + } + if (isNotBlank(er.getUrl())) { + fields.add(mapQualifier("qualifier", er.getQualifier())); + } + if (isNotBlank(er.getRefidentifier())) { + fields.add(asXmlElement("refidentifier", er.getRefidentifier())); + } + if (isNotBlank(er.getQuery())) { + fields.add(asXmlElement("query", er.getQuery())); + } + + children.add(templateFactory.getChild("externalreference", null, fields)); + } + } + } + + return children; + } + + private List listExtraInfo(JoinedEntity je) { + final List extraInfo = je.getEntity().getExtraInfo(); + return extraInfo != null ? 
+
+	private List<String> listExtraInfo(JoinedEntity je) {
+		final List<ExtraInfo> extraInfo = je.getEntity().getExtraInfo();
+		return extraInfo != null ? extraInfo
+				.stream()
+				.map(e -> mapExtraInfo(e))
+				.collect(Collectors.toList()) : Lists.newArrayList();
+	}
+
+	private List<String> buildContexts(final String type, final Set<String> contexts) {
+		final List<String> res = Lists.newArrayList();
+
+		if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
+
+			XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
+
+			for (final String context : contexts) {
+
+				String id = "";
+				for (final String token : Splitter.on("::").split(context)) {
+					id += token;
+
+					final ContextDef def = contextMapper.get(id);
+
+					if (def == null) {
+						continue;
+						// throw new IllegalStateException(String.format("cannot find context for id '%s'", id));
+					}
+
+					if (def.getName().equals("context")) {
+						final String xpath = "//context/@id='" + def.getId() + "'";
+						if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) {
+							document = addContextDef(document.gotoRoot(), def);
+						}
+					}
+
+					if (def.getName().equals("category")) {
+						final String rootId = substringBefore(def.getId(), "::");
+						document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def);
+					}
+
+					if (def.getName().equals("concept")) {
+						document = addContextDef(document, def).gotoParent();
+					}
+					id += "::";
+				}
+			}
+			final Transformer transformer = getTransformer();
+			for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) {
+				try {
+					res.add(asStringElement(x, transformer));
+				} catch (final TransformerException e) {
+					throw new RuntimeException(e);
+				}
+			}
+		}
+
+		return res;
+	}
+
+	private Transformer getTransformer() {
+		try {
+			Transformer transformer = TransformerFactory.newInstance().newTransformer();
+			transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
+			return transformer;
+		} catch (TransformerConfigurationException e) {
+			throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e);
+		}
+	}
+
+	private XMLTag addContextDef(final XMLTag tag, final ContextDef def) {
+		tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel());
+		if ((def.getType() != null) && !def.getType().isEmpty()) {
+			tag.addAttribute("type", def.getType());
+		}
+		return tag;
+	}
+
+	private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException {
+		final StringWriter buffer = new StringWriter();
+		transformer.transform(new DOMSource(element), new StreamResult(buffer));
+		return buffer.toString();
+	}
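+
+	// Note: context ids are "::"-separated paths; the Splitter loop in buildContexts
+	// above visits every prefix in turn, so an id like "EC::FP7::SP2" triggers the
+	// lookups "EC" (context), "EC::FP7" (category) and "EC::FP7::SP2" (concept).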
+
+	private void fillContextMap(final String xmlTree, final Set<String> contexts) {
+
+		Document fundingPath;
+		try {
+			fundingPath = new SAXReader().read(new StringReader(xmlTree));
+		} catch (final DocumentException e) {
+			throw new RuntimeException(e);
+		}
+		try {
+			final Node funder = fundingPath.selectSingleNode("//funder");
+
+			if (funder != null) {
+
+				final String funderShortName = funder.valueOf("./shortname");
+				contexts.add(funderShortName);
+
+				contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
+				final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
+				if (level0 != null) {
+					final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name"));
+					contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
+					final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
+					if (level1 == null) {
+						contexts.add(level0Id);
+					} else {
+						final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
+						contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
+						final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
+						if (level2 == null) {
+							contexts.add(level1Id);
+						} else {
+							final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
+							contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
+							contexts.add(level2Id);
+						}
+					}
+				}
+			}
+		} catch (final NullPointerException e) {
+			throw new IllegalArgumentException("malformed funding path: " + xmlTree, e);
+		}
+	}
+
+	@SuppressWarnings("unchecked")
+	protected static String getRelFundingTree(final String xmlTree) {
+		String funding = "<funding>";
+		try {
+			final Document ftree = new SAXReader().read(new StringReader(xmlTree));
+			funding = "<funding>";
+
+			funding += getFunderElement(ftree);
+
+			for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) {
+				final Element e = (Element) o;
+				final String _id = e.valueOf("./id");
+				funding += "<" + e.getName() + " name=\"" + escapeXml(e.valueOf("./name")) + "\">" + escapeXml(_id) + "</" + e.getName() + ">";
+			}
+		} catch (final DocumentException e) {
+			throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage());
+		} finally {
+			funding += "</funding>";
+		}
+		return funding;
+	}
+
+	private static String getFunderElement(final Document ftree) {
+		final String funderId = ftree.valueOf("//fundingtree/funder/id");
+		final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname");
+		final String funderName = ftree.valueOf("//fundingtree/funder/name");
+		final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction");
+
+		return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName) + "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";
+	}
+
+}
\ No newline at end of file
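For reference, a minimal sketch of what the funding-tree serialization above produces; the ids, names and jurisdiction below are made-up example values, not taken from this patch:

// Illustrative input/output for getRelFundingTree (values are fabricated).
final String xmlTree = "<fundingtree>"
        + "<funder><id>ec__________::EC</id><shortname>EC</shortname>"
        + "<name>European Commission</name><jurisdiction>EU</jurisdiction></funder>"
        + "<funding_level_0><id>ec__________::EC::FP7</id><name>FP7</name>"
        + "<description>SEVENTH FRAMEWORK PROGRAMME</description></funding_level_0>"
        + "</fundingtree>";

String rel = getRelFundingTree(xmlTree);
// expected shape, on one line:
// <funding><funder id="ec__________::EC" shortname="EC" name="European Commission"
//   jurisdiction="EU" /><funding_level_0 name="FP7">ec__________::EC::FP7</funding_level_0></funding>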
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java
new file mode 100644
index 0000000000..3088828ab0
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/graph/utils/XmlSerializationUtils.java
@@ -0,0 +1,151 @@
+package eu.dnetlib.dhp.graph.utils;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+
+import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
+import static org.apache.commons.lang3.StringUtils.isBlank;
+import static org.apache.commons.lang3.StringUtils.isNotBlank;
+
+public class XmlSerializationUtils {
+
+	// XML 1.0
+	// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+	private final static String xml10pattern = "[^"
+			+ "\u0009\r\n"
+			+ "\u0020-\uD7FF"
+			+ "\uE000-\uFFFD"
+			+ "\ud800\udc00-\udbff\udfff"
+			+ "]";
+
+	public static String mapJournal(Journal j) {
+		final String attrs = new StringBuilder()
+				.append(attr("issn", j.getIssnPrinted()))
+				.append(attr("eissn", j.getIssnOnline()))
+				.append(attr("lissn", j.getIssnLinking()))
+				.append(attr("ep", j.getEp()))
+				.append(attr("iss", j.getIss()))
+				.append(attr("sp", j.getSp()))
+				.append(attr("vol", j.getVol()))
+				.toString()
+				.trim();
+
+		return new StringBuilder()
+				.append("<journal " + attrs + ">")
+				.append(escapeXml(j.getName()))
+				.append("</journal>")
+				.toString();
+	}
+
+	private static String attr(final String name, final String value) {
+		return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : "";
+	}
+
+	public static String mapStructuredProperty(String name, StructuredProperty t) {
+		return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo());
+	}
+
+	public static String mapQualifier(String name, Qualifier q) {
+		return asXmlElement(name, "", q, null);
+	}
+
+	public static String escapeXml(final String value) {
+		return value
+				.replaceAll("&", "&amp;")
+				.replaceAll("<", "&lt;")
+				.replaceAll(">", "&gt;")
+				.replaceAll("\"", "&quot;")
+				.replaceAll("'", "&apos;")
+				.replaceAll(xml10pattern, "");
+	}
+
+	public static String parseDataInfo(final DataInfo dataInfo) {
+		return new StringBuilder()
+				.append("<datainfo>")
+				.append(asXmlElement("inferred", dataInfo.getInferred() + ""))
+				.append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + ""))
+				.append(asXmlElement("trust", dataInfo.getTrust() + ""))
+				.append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + ""))
+				.append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null))
+				.append("</datainfo>")
+				.toString();
+	}
+
+	private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) {
+		return sb
+				.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
+				.append(attr("inferenceprovenance", info.getInferenceprovenance()))
+				.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
+				.append(attr("trust", info.getTrust()));
+	}
+
+	public static String mapKeyValue(final String name, final KeyValue kv) {
+		return new StringBuilder()
+				.append("<")
+				.append(name)
+				.append(" name=\"")
+				.append(escapeXml(kv.getValue()))
+				.append("\" id=\"")
+				.append(escapeXml(removePrefix(kv.getKey())))
+				.append("\"/>")
+				.toString();
+	}
+
+	public static String mapExtraInfo(final ExtraInfo e) {
+		return new StringBuilder("<extraInfo ")
+				.append("name=\"" + e.getName() + "\" ")
+				.append("typology=\"" + e.getTypology() + "\" ")
+				.append("provenance=\"" + e.getProvenance() + "\" ")
+				.append("trust=\"" + e.getTrust() + "\"")
+				.append(">")
+				.append(e.getValue())
+				.append("</extraInfo>")
+				.toString();
+	}
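+
+	// Note: escapeXml above first entity-encodes the five XML special characters and
+	// then strips code points that are illegal in XML 1.0, e.g. (illustrative):
+	//   escapeXml("a < b & \u0001")  ->  "a &lt; b &amp; "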
+
+	public static String asXmlElement(final String name, final String value) {
+		return asXmlElement(name, value, null, null);
+	}
+
+	public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) {
+		StringBuilder sb = new StringBuilder();
+		sb.append("<");
+		sb.append(name);
+		if (q != null) {
+			sb.append(getAttributes(q));
+		}
+		if (info != null) {
+			sb
+					.append(" ")
+					.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
+					.append(attr("inferenceprovenance", info.getInferenceprovenance()))
+					.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
+					.append(attr("trust", info.getTrust()));
+		}
+		if (isBlank(value)) {
+			sb.append("/>");
+			return sb.toString();
+		}
+
+		sb.append(">");
+		sb.append(escapeXml(value));
+		sb.append("</" + name + ">");
+
+		return sb.toString();
+	}
+
+	public static String getAttributes(final Qualifier q) {
+		if (q == null || q.isBlank()) return "";
+
+		return new StringBuilder(" ")
+				.append(attr("classid", q.getClassid()))
+				.append(attr("classname", q.getClassname()))
+				.append(attr("schemeid", q.getSchemeid()))
+				.append(attr("schemename", q.getSchemename()))
+				.toString();
+	}
+
+}
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory b/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory
new file mode 100644
index 0000000000..b53ca855ff
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/META-INF/services/javax.xml.transform.TransformerFactory
@@ -0,0 +1 @@
+net.sf.saxon.TransformerFactoryImpl
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json
new file mode 100644
index 0000000000..a5d20a55f4
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json
@@ -0,0 +1,7 @@
+[
+  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+  {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
+  {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
+  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true},
+  {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json
new file mode 100644
index 0000000000..0d45e9e29f
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/input_params_update_index.json
@@ -0,0 +1,7 @@
+[
+  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
+  {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
+  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true},
+  {"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true},
+  {"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false}
+]
diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml
b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml similarity index 69% rename from dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml rename to dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml index fcab9dd00a..624d3ea763 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml @@ -23,4 +23,12 @@ hive_db_name openaire
+ + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 + + + spark2EventLogDir + /user/spark/applicationHistory + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml new file mode 100644 index 0000000000..b154b61e11 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/workflow.xml @@ -0,0 +1,114 @@ + + + + + hive_db_name + the target hive database name + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + + ${wf:conf('reuseRecords') eq false} + ${wf:conf('reuseRecords') eq true} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + build_adjacency_lists + eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores ${sparkExecutorCoresForJoining} + --executor-memory ${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -mt yarn + -is ${isLookupUrl} + -t ${otherDsTypeId} + --sourcePath${sourcePath} + --outputPath${outputPath} + + + + + + + + yarn + cluster + to_solr_index + eu.dnetlib.dhp.graph.SparkXmlIndexingJob + dhp-graph-provision-${projectVersion}.jar + + --executor-cores ${sparkExecutorCoresForIndexing} + --executor-memory ${sparkExecutorMemoryForIndexing} + --driver-memory=${sparkDriverMemoryForIndexing} + --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + -mt yarn + -is ${isLookupUrl} + --sourcePath${outputPath}/xml + --format${format} + --batchSize${batchSize} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st new file mode 100644 index 0000000000..89f81e16bf --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/child.st @@ -0,0 +1,3 @@ +> + $metadata:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st new file mode 100644 
index 0000000000..d16f3c3e09 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/entity.st @@ -0,0 +1,10 @@ + + $metadata:{ it | $it$ }$ + + $rels:{ it | $it$ }$ + + + $children:{ it | $it$ }$ + + +$extrainfo:{ it | $it$ }$ \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st new file mode 100644 index 0000000000..64bed05b45 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/instance.st @@ -0,0 +1,4 @@ + + $metadata:{ it | $it$ }$ + $webresources:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st new file mode 100644 index 0000000000..dea68eab88 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/record.st @@ -0,0 +1,17 @@ + + + +
+ $id$ + $dateofcollection$ + $dateoftransformation$ +
+ + + $it$ + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st new file mode 100644 index 0000000000..af19ba497b --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/rel.st @@ -0,0 +1,4 @@ + + $objIdentifier$ + $metadata:{ it | $it$ }$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st new file mode 100644 index 0000000000..7ff6c5d7fa --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/graph/template/webresource.st @@ -0,0 +1,3 @@ + + $identifier$ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java new file mode 100644 index 0000000000..74416ea597 --- /dev/null +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/graph/GraphJoinerTest.java @@ -0,0 +1,38 @@ +package eu.dnetlib.dhp.graph; + +import org.junit.jupiter.api.BeforeEach; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class GraphJoinerTest { + + private ClassLoader cl = getClass().getClassLoader(); + private Path workingDir; + private Path inputDir; + private Path outputDir; + + @BeforeEach + public void before() throws IOException { + workingDir = Files.createTempDirectory("promote_action_set"); + inputDir = workingDir.resolve("input"); + outputDir = workingDir.resolve("output"); + } + + private static void copyFiles(Path source, Path target) throws IOException { + Files.list(source).forEach(f -> { + try { + if (Files.isDirectory(f)) { + Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); + copyFiles(f, subTarget); + } else { + Files.copy(f, target.resolve(f.getFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + } +} diff --git a/dhp-workflows/docs/oozie-installer.markdown b/dhp-workflows/docs/oozie-installer.markdown index b1953a54eb..d2de80dcc5 100644 --- a/dhp-workflows/docs/oozie-installer.markdown +++ b/dhp-workflows/docs/oozie-installer.markdown @@ -10,9 +10,8 @@ This module is automatically executed when running: on module having set: - eu.dnetlib - dhp-wf - 1.0.0-SNAPSHOT + eu.dnetlib.dhp + dhp-workflows in `pom.xml` file. `oozie-package` profile initializes oozie workflow packaging, `workflow.source.dir` property points to a workflow (notice: this is not a relative path but a classpath to directory usually holding `oozie_app` subdirectory). 
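The recursive copyFiles helper in GraphJoinerTest above could equally be written over Files.walk; a minimal sketch, with copyTree as an illustrative name rather than anything from this patch:

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

public class CopyTree {

    // Copies a directory tree by walking it once; directories are re-created,
    // regular files are copied, and IOExceptions are rethrown unchecked.
    static void copyTree(Path source, Path target) throws IOException {
        try (Stream<Path> stream = Files.walk(source)) {
            stream.forEach(p -> {
                try {
                    Path dest = target.resolve(source.relativize(p).toString());
                    if (Files.isDirectory(p)) {
                        Files.createDirectories(dest);
                    } else {
                        Files.copy(p, dest);
                    }
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
        }
    }
}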
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 41465eca87..b882ede18d 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp - 1.0.5-SNAPSHOT + 1.1.6-SNAPSHOT ../ @@ -17,6 +17,8 @@ dhp-aggregation dhp-distcp dhp-graph-mapper + dhp-dedup-openaire + dhp-graph-provision dhp-dedup-scholexplorer dhp-graph-provision-scholexplorer diff --git a/pom.xml b/pom.xml index 039b94d446..ae46e1e249 100644 --- a/pom.xml +++ b/pom.xml @@ -1,242 +1,353 @@ - + - 4.0.0 - eu.dnetlib.dhp - dhp - 1.0.5-SNAPSHOT - pom + 4.0.0 + eu.dnetlib.dhp + dhp + 1.1.6-SNAPSHOT + pom - http://www.d-net.research-infrastructures.eu + http://www.d-net.research-infrastructures.eu - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + - - dhp-build - dhp-schemas - dhp-common - dhp-workflows - + + dhp-build + dhp-schemas + dhp-common + dhp-workflows + - - Redmine - https://issue.openaire.research-infrastructures.eu/projects/openaire - + + Redmine + https://issue.openaire.research-infrastructures.eu/projects/openaire + - - jenkins - https://jenkins-dnet.d4science.org/ - + + jenkins + https://jenkins-dnet.d4science.org/ + - - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git - https://code-repo.d4science.org/D-Net/dnet-hadoop/ - HEAD - + + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + scm:git:gitea@code-repo.d4science.org:D-Net/dnet-hadoop.git + https://code-repo.d4science.org/D-Net/dnet-hadoop/ + HEAD + - - + + - - - dnet45-releases - D-Net 45 releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - default - - false - - - true - - - - cloudera - Cloudera Repository - https://repository.cloudera.com/artifactory/cloudera-repos - - true - - - false - - - + + + dnet45-releases + D-Net 45 releases + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + default + + false + + + true + + + + cloudera + Cloudera Repository + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + - - - junit - junit - 4.12 - test - + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} + test + - - org.mockito - mockito-core - 2.7.22 - test - + + org.mockito + mockito-core + ${mockito-core.version} + test + + + + org.mockito + mockito-junit-jupiter + ${mockito-core.version} + test + - - - - - - org.apache.hadoop - hadoop-hdfs - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client - ${dhp.hadoop.version} - provided - - - org.apache.spark - spark-core_2.11 - ${dhp.spark.version} - provided - - - org.apache.spark - spark-sql_2.11 - ${dhp.spark.version} - provided - - - org.apache.spark - spark-graphx_2.11 - ${dhp.spark.version} - provided - - - - org.apache.commons - commons-lang3 - ${dhp.commons.lang.version} - - - - commons-codec - commons-codec - 1.9 - - - - commons-io - commons-io - 2.4 - - - org.mongodb - mongo-java-driver - 3.4.2 - - - commons-cli - commons-cli - 1.2 - provided - + - - net.sf.saxon - Saxon-HE - 9.5.1-5 - + + + + org.apache.hadoop + hadoop-hdfs + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-common + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client + ${dhp.hadoop.version} + provided + + + 
org.apache.hadoop + hadoop-distcp + ${dhp.hadoop.version} + provided + + + org.apache.spark + spark-core_2.11 + ${dhp.spark.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${dhp.spark.version} + provided + + + org.apache.spark + spark-graphx_2.11 + ${dhp.spark.version} + provided + - - dom4j - dom4j - 1.6.1 - + + org.slf4j + jcl-over-slf4j + 1.7.25 + provided + - - xml-apis - xml-apis - 1.4.01 - + + org.apache.commons + commons-lang3 + ${dhp.commons.lang.version} + - - jaxen - jaxen - 1.1.6 - - - - net.schmizz - sshj - 0.10.0 - test - - - - com.fasterxml.jackson.core - jackson-core - ${dhp.jackson.version} - provided - - - - com.fasterxml.jackson.core - jackson-annotations - ${dhp.jackson.version} - provided - - - com.fasterxml.jackson.core - jackson-databind - ${dhp.jackson.version} - provided - - - - eu.dnetlib - dnet-pace-core - 4.0.0 - + + com.google.guava + guava + ${dhp.guava.version} + - - javax.persistence - javax.persistence-api - 2.2 - provided - + + commons-codec + commons-codec + 1.9 + + + + commons-io + commons-io + 2.4 + + + + commons-cli + commons-cli + 1.2 + provided + + + + net.sf.saxon + Saxon-HE + 9.9.1-6 + + + + dom4j + dom4j + 1.6.1 + + + + xml-apis + xml-apis + 1.4.01 + + + + jaxen + jaxen + 1.1.6 + + + + com.mycila.xmltool + xmltool + 3.3 + + + + org.apache.solr + solr-solrj + 7.5.0 + + + * + * + + + + + com.lucidworks.spark + spark-solr + 3.6.0 + + + * + * + + + + + + org.apache.httpcomponents + httpclient + 4.5.3 + + + org.apache.httpcomponents + httpmime + 4.5.3 + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + + + + net.schmizz + sshj + 0.10.0 + test + + + + com.fasterxml.jackson.core + jackson-core + ${dhp.jackson.version} + provided + + + + com.fasterxml.jackson.core + jackson-annotations + ${dhp.jackson.version} + provided + + + com.fasterxml.jackson.core + jackson-databind + ${dhp.jackson.version} + provided + + + + eu.dnetlib + dnet-actionmanager-common + 6.0.5 + + + eu.dnetlib + dnet-openaire-data-protos + 3.9.8-proto250 + + + eu.dnetlib + dnet-pace-core + 4.0.0 + + + eu.dnetlib + cnr-rmi-api + [2.0.0,3.0.0) + + + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + + + javax.persistence + javax.persistence-api + 2.2 + provided + + + + com.rabbitmq + amqp-client + 5.6.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + org.antlr + stringtemplate + 4.0 + - - com.rabbitmq - amqp-client - 5.6.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - com.ximpleware vtd-xml @@ -250,197 +361,175 @@ - - org.apache.oozie - oozie-client - ${dhp.oozie.version} - provided - - - - slf4j-simple - org.slf4j - - - - - + org.apache.oozie + oozie-client + ${dhp.oozie.version} + provided + + + + slf4j-simple + org.slf4j + + + + + - - target - target/classes - ${project.artifactId}-${project.version} - target/test-classes - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven.compiler.plugin.version} - - 1.8 - 1.8 - ${project.build.sourceEncoding} - - + + target + target/classes + ${project.artifactId}-${project.version} + target/test-classes + + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven.compiler.plugin.version} + + 1.8 + 1.8 + ${project.build.sourceEncoding} + + - - org.apache.maven.plugins - maven-jar-plugin - 3.0.2 - + + org.apache.maven.plugins + maven-jar-plugin + 3.0.2 + - - 
org.apache.maven.plugins - maven-source-plugin - 3.0.1 - - - attach-sources - verify - - jar-no-fork - - - - + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + - - org.apache.maven.plugins - maven-surefire-plugin - 2.19.1 - - true - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.4 - - true - - - - org.apache.maven.plugins - maven-dependency-plugin - 3.0.0 - + + org.apache.maven.plugins + maven-surefire-plugin + 3.0.0-M4 + + true + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.0 + - - org.codehaus.mojo - build-helper-maven-plugin - 1.12 - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - - org.jacoco - jacoco-maven-plugin - 0.7.9 - - - **/schemas/* - **/com/cloudera/**/* - **/org/apache/avro/io/**/* - - - - - default-prepare-agent - - prepare-agent - - - - default-report - prepare-package - - report - - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - + + org.codehaus.mojo + build-helper-maven-plugin + 1.12 + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + org.jacoco + jacoco-maven-plugin + 0.7.9 + + + **/schemas/* + **/com/cloudera/**/* + **/org/apache/avro/io/**/* + + + + + default-prepare-agent + + prepare-agent + + + + default-report + prepare-package + + report + + + + - - - org.apache.maven.wagon - wagon-ssh - 2.10 - - - - - - dnet45-snapshots - DNet45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots - default - - - dnet45-releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.4 - - true - - - - + - - UTF-8 - UTF-8 - 3.6.0 - 2.22.2 - cdh5.9.2 - 2.6.0-${dhp.cdh.version} - 4.1.0-${dhp.cdh.version} - 2.4.0.cloudera2 - 2.9.6 - 3.5 - 2.11.12 - [2.12,3.0) - + + + org.apache.maven.wagon + wagon-ssh + 2.10 + + + + + + dnet45-snapshots + DNet45 Snapshots + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + default + + + dnet45-releases + http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + true + + + + + + + UTF-8 + UTF-8 + 3.6.0 + 2.22.2 + cdh5.9.2 + 2.6.0-${dhp.cdh.version} + 4.1.0-${dhp.cdh.version} + 2.4.0.cloudera2 + 2.9.6 + 3.5 + 11.0.2 + 2.11.12 + 5.6.1 + 3.3.3 + 3.4.2 + [2.12,3.0) + -
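For context, the junit-jupiter, mockito-core and mockito-junit-jupiter versions pinned in the properties above enable the extension-based test style this change set migrates to; a minimal sketch, where the test class and the mocked Supplier are illustrative:

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.when;

import java.util.function.Supplier;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

// JUnit 5 + Mockito via @ExtendWith, replacing the JUnit 4 @RunWith(MockitoJUnitRunner.class)
@ExtendWith(MockitoExtension.class)
public class ExampleTest {

    @Mock
    private Supplier<String> supplier;

    @Test
    public void returnsMockedValue() {
        when(supplier.get()).thenReturn("value");
        assertEquals("value", supplier.get());
    }
}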