merge with master

unique_field_in_lists
Sandro La Bruzzo 4 years ago
commit 0cd022ad6a

4
.gitignore vendored

@ -3,6 +3,8 @@
*.iws
*.ipr
*.iml
*.ipr
*.iws
*~
.vscode
.classpath
@ -21,5 +23,5 @@
/*/build
/build
spark-warehouse
/*/*/job-override.properties
/**/job-override.properties

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
@ -76,6 +76,41 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-plugin-plugin
</artifactId>
<versionRange>
[3.2,)
</versionRange>
<goals>
<goal>descriptor</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore></ignore>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

@ -1,22 +1,21 @@
package eu.dnetlib.maven.plugin.properties;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME;
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import org.junit.Before;
import org.junit.Test;
import static org.junit.jupiter.api.Assertions.*;
/**
* @author mhorst
* @author mhorst, claudio.atzori
*
*/
public class GenerateOoziePropertiesMojoTest {
private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@Before
@BeforeEach
public void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
@ -28,7 +27,7 @@ public class GenerateOoziePropertiesMojoTest {
mojo.execute();
// assert
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test

@ -1,51 +1,41 @@
package eu.dnetlib.maven.plugin.properties;
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.doReturn;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Properties;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.project.MavenProject;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.runners.MockitoJUnitRunner;
import org.mockito.MockitoAnnotations;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.*;
import java.util.Properties;
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.lenient;
/**
* @author mhorst
* @author mhorst, claudio.atzori
*
*/
@RunWith(MockitoJUnitRunner.class)
@ExtendWith(MockitoExtension.class)
public class WritePredefinedProjectPropertiesTest {
@Rule
public TemporaryFolder testFolder = new TemporaryFolder();
@Mock
private MavenProject mavenProject;
private WritePredefinedProjectProperties mojo;
@Before
public void init() {
@BeforeEach
public void init(@TempDir File testFolder) {
MockitoAnnotations.initMocks(this);
mojo = new WritePredefinedProjectProperties();
mojo.outputFile = getPropertiesFileLocation();
mojo.outputFile = getPropertiesFileLocation(testFolder);
mojo.project = mavenProject;
doReturn(new Properties()).when(mavenProject).getProperties();
lenient().doReturn(new Properties()).when(mavenProject).getProperties();
}
// ----------------------------------- TESTS ---------------------------------------------
@ -57,7 +47,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(0, storedProperties.size());
}
@ -75,28 +65,28 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
@Test(expected=MojoExecutionException.class)
public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception {
@Test()
public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
Properties projectProperties = new Properties();
projectProperties.setProperty(key, value);
doReturn(projectProperties).when(mavenProject).getProperties();
mojo.outputFile = testFolder.getRoot();
mojo.outputFile = testFolder;
// execute
mojo.execute();
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
public void testExecuteWithProjectPropertiesExclusion() throws Exception {
public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -113,14 +103,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
@Test
public void testExecuteWithProjectPropertiesInclusion() throws Exception {
public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -137,14 +127,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test
public void testExecuteIncludingPropertyKeysFromFile() throws Exception {
public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -155,7 +145,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties");
File includedPropertiesFile = new File(testFolder, "included.properties");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileWriter(includedPropertiesFile), null);
@ -167,14 +157,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test
public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception {
public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -192,14 +182,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test(expected=MojoExecutionException.class)
public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception {
@Test
public void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -213,11 +203,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {""});
// execute
mojo.execute();
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception {
public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -228,7 +218,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null);
@ -240,14 +230,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey));
}
@Test(expected=MojoExecutionException.class)
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception {
@Test
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -258,7 +248,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileOutputStream(includedPropertiesFile), null);
@ -266,11 +256,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
// execute
mojo.execute();
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
public void testExecuteWithQuietModeOn() throws Exception {
public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given
mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
@ -280,21 +270,21 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertEquals(0, storedProperties.size());
}
@Test(expected=MojoExecutionException.class)
public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception {
@Test
public void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
// execute
mojo.execute();
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
}
@Test
public void testExecuteWithEnvironmentProperties() throws Exception {
public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given
mojo.setIncludeEnvironmentVariables(true);
@ -303,7 +293,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
for (Object currentKey : storedProperties.keySet()) {
assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV));
@ -311,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithSystemProperties() throws Exception {
public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey";
String value = "systemPropertyValue";
@ -323,14 +313,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key));
}
@Test
public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception {
public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey ";
String value = "systemPropertyValue";
@ -344,7 +334,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert
assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties();
Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0);
assertFalse(storedProperties.containsKey(key));
assertTrue(storedProperties.containsKey(key.trim()));
@ -353,13 +343,13 @@ public class WritePredefinedProjectPropertiesTest {
// ----------------------------------- PRIVATE -------------------------------------------
private File getPropertiesFileLocation() {
return new File(testFolder.getRoot(), "test.properties");
private File getPropertiesFileLocation(File testFolder) {
return new File(testFolder, "test.properties");
}
private Properties getStoredProperties() throws FileNotFoundException, IOException {
private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException {
Properties properties = new Properties();
properties.load(new FileInputStream(getPropertiesFileLocation()));
properties.load(new FileInputStream(getPropertiesFileLocation(testFolder)));
return properties;
}
}

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
@ -42,6 +42,23 @@
<groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId>
</dependency>
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>

@ -0,0 +1,24 @@
package eu.dnetlib.dhp.utils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
public class ISLookupClientFactory {
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
}
@SuppressWarnings("unchecked")
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint);
return (T) jaxWsProxyFactory.create();
}
}

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.lib.ExtensionFunctionCall;
import net.sf.saxon.lib.ExtensionFunctionDefinition;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.om.StructuredQName;
import net.sf.saxon.trans.XPathException;
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public abstract String getName();
public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
@Override
public StructuredQName getFunctionQName() {
return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
}
@Override
public ExtensionFunctionCall makeCallExpression() {
return new ExtensionFunctionCall() {
@Override
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
return doCall(context, arguments);
}
};
}
}

@ -0,0 +1,67 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.GregorianCalendar;
public class ExtractYear extends AbstractExtensionFunction {
private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
@Override
public String getName() {
return "extractYear";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
return new StringValue("");
}
final Item item = arguments[0].head();
if (item == null) {
return new StringValue("");
}
return new StringValue(_year(item.getStringValue()));
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 1;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
private String _year(String s) {
Calendar c = new GregorianCalendar();
for (String format : dateFormats) {
try {
c.setTime(new SimpleDateFormat(format).parse(s));
String year = String.valueOf(c.get(Calendar.YEAR));
return year;
} catch (ParseException e) {}
}
return "";
}
}

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
public class NormalizeDate extends AbstractExtensionFunction {
private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
@Override
public String getName() {
return "normalizeDate";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
return new StringValue("");
}
String s = arguments[0].head().getStringValue();
return new StringValue(_year(s));
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 1;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
private String _year(String s) {
final String date = s != null ? s.trim() : "";
for (String format : normalizeDateFormats) {
try {
Date parse = new SimpleDateFormat(format).parse(date);
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
return res;
} catch (ParseException e) {}
}
return "";
}
}

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import org.apache.commons.lang3.StringUtils;
public class PickFirst extends AbstractExtensionFunction {
@Override
public String getName() {
return "pickFirst";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
return new StringValue("");
}
final String s1 = getValue(arguments[0]);
final String s2 = getValue(arguments[1]);
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
}
private String getValue(final Sequence arg) throws XPathException {
if (arg != null) {
final Item item = arg.head();
if (item != null) {
return item.getStringValue();
}
}
return "";
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 2;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
}

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.Configuration;
import net.sf.saxon.TransformerFactoryImpl;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamSource;
import java.io.StringReader;
public class SaxonTransformerFactory {
/**
* Creates the index record transformer from the given XSLT
* @param xslt
* @return
* @throws TransformerException
*/
public static Transformer newInstance(final String xslt) throws TransformerException {
final TransformerFactoryImpl factory = new TransformerFactoryImpl();
final Configuration conf = factory.getConfiguration();
conf.registerExtensionFunction(new ExtractYear());
conf.registerExtensionFunction(new NormalizeDate());
conf.registerExtensionFunction(new PickFirst());
return factory.newTransformer(new StreamSource(new StringReader(xslt)));
}
}

@ -1,18 +1,13 @@
package eu.dnetlib.dhp.application;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import java.io.ByteArrayOutputStream;
import java.util.Base64;
import java.util.zip.GZIPOutputStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
public class ArgumentApplicationParserTest {
@Test
public void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.model.mdstore;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class MetadataRecordTest {
@ -10,6 +10,6 @@ public class MetadataRecordTest {
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() >0);
assertTrue(r.getDateOfCollection() > 0);
}
}

@ -1,12 +1,12 @@
package eu.dnetlib.message;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.Assert.*;
import static org.junit.jupiter.api.Assertions.*;
public class MessageTest {

@ -1,7 +1,7 @@
package eu.dnetlib.scholexplorer.relation;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.junit.jupiter.api.Test;
public class RelationMapperTest {

Binary file not shown.

After

Width:  |  Height:  |  Size: 689 KiB

@ -1,3 +1,11 @@
Description of the project
--------------------------
This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
This project defines **object schemas** of the OpenAIRE main entities and the relationships that intercur among them.
Namely it defines the model for
- **research product (result)** which subclasses in publication, dataset, other research product, software
- **data source** object describing the data provider (institutional repository, aggregators, cris systems)
- **organization** research bodies managing a data source or participating to a research project
- **project** research project
Te serialization of such objects (data store files) are used to pass data between workflow nodes in the processing pipeline.

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
@ -26,18 +26,15 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
</dependencies>

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.io.Serializable;
@JsonDeserialize(using = AtomicActionDeserializer.class)
public class AtomicAction<T extends Oaf> implements Serializable {
private Class<T> clazz;
private T payload;
public AtomicAction() {
}
public AtomicAction(Class<T> clazz, T payload) {
this.clazz = clazz;
this.payload = payload;
}
public Class<T> getClazz() {
return clazz;
}
public void setClazz(Class<T> clazz) {
this.clazz = clazz;
}
public T getPayload() {
return payload;
}
public void setPayload(T payload) {
this.payload = payload;
}
}

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import java.io.IOException;
public class AtomicActionDeserializer extends JsonDeserializer {
@Override
public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException {
JsonNode node = jp.getCodec().readTree(jp);
String classTag = node.get("clazz").asText();
JsonNode payload = node.get("payload");
ObjectMapper mapper = new ObjectMapper();
try {
final Class<?> clazz = Class.forName(classTag);
return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz));
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
}

@ -0,0 +1,15 @@
package eu.dnetlib.dhp.schema.oaf;
public class Country extends Qualifier {
private DataInfo dataInfo;
public DataInfo getDataInfo() {
return dataInfo;
}
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
}

@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable {
private List<Field<String>> odlanguages;
private List< Field<String>> odcontenttypes;
private List<Field<String>> odcontenttypes;
private List< Field<String>> accessinfopackage;
private List<Field<String>> accessinfopackage;
// re3data fields
private Field<String> releasestartdate;

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
this.place = place;
}
@JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(point) &&
StringUtils.isBlank(box) &&

@ -22,6 +22,14 @@ public class Instance implements Serializable {
private Field<String> dateofacceptance;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
private Field<String> processingchargeamount;
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
private Field<String> processingchargecurrency;
private Field<String> refereed; //peer-review status
public Field<String> getLicense() {
return license;
}
@ -86,7 +94,29 @@ public class Instance implements Serializable {
this.dateofacceptance = dateofacceptance;
}
public Field<String> getProcessingchargeamount() {
return processingchargeamount;
}
public void setProcessingchargeamount(Field<String> processingchargeamount) {
this.processingchargeamount = processingchargeamount;
}
public Field<String> getProcessingchargecurrency() {
return processingchargecurrency;
}
public void setProcessingchargecurrency(Field<String> processingchargecurrency) {
this.processingchargecurrency = processingchargecurrency;
}
public Field<String> getRefereed() {
return refereed;
}
public void setRefereed(Field<String> refereed) {
this.refereed = refereed;
}
public String toComparableString(){
return String.format("%s::%s::%s::%s",

@ -1,12 +1,10 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@JsonIgnoreProperties({"blank"})
public class KeyValue implements Serializable {
private String key;
@ -39,7 +37,6 @@ public class KeyValue implements Serializable {
this.dataInfo = dataInfo;
}
@JsonIgnore
public String toComparableString() {
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
}

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
schemeid != null ? schemeid : "",
schemename != null ? schemename : "");
}
@JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(classid) &&
StringUtils.isBlank(classname) &&

@ -1,85 +1,104 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.google.common.base.Preconditions.checkArgument;
public class Relation extends Oaf {
private String relType;
private String relType;
private String subRelType;
private String relClass;
private String source;
private String target;
private String subRelType;
private List<KeyValue> collectedFrom = new ArrayList<>();
private String relClass;
public String getRelType() {
return relType;
}
private String source;
public void setRelType(final String relType) {
this.relType = relType;
}
private String target;
public String getSubRelType() {
return subRelType;
}
private List<KeyValue> collectedFrom;
public void setSubRelType(final String subRelType) {
this.subRelType = subRelType;
}
public String getRelType() {
return relType;
}
public String getRelClass() {
return relClass;
}
public void setRelType(String relType) {
this.relType = relType;
}
public void setRelClass(final String relClass) {
this.relClass = relClass;
}
public String getSubRelType() {
return subRelType;
}
public String getSource() {
return source;
}
public void setSubRelType(String subRelType) {
this.subRelType = subRelType;
}
public void setSource(final String source) {
this.source = source;
}
public String getRelClass() {
return relClass;
}
public String getTarget() {
return target;
}
public void setRelClass(String relClass) {
this.relClass = relClass;
}
public void setTarget(final String target) {
this.target = target;
}
public String getSource() {
return source;
}
public List<KeyValue> getCollectedFrom() {
return collectedFrom;
}
public void setSource(String source) {
this.source = source;
}
public void setCollectedFrom(final List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
}
public String getTarget() {
return target;
}
public void mergeFrom(final Relation r) {
public void setTarget(String target) {
this.target = target;
}
checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal");
checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal");
checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal");
checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be equal");
checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal");
public List<KeyValue> getCollectedFrom() {
return collectedFrom;
}
setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
}
public void setCollectedFrom(List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Relation relation = (Relation) o;
return relType.equals(relation.relType) &&
subRelType.equals(relation.subRelType) &&
relClass.equals(relation.relClass) &&
source.equals(relation.source) &&
target.equals(relation.target) &&
Objects.equals(collectedFrom, relation.collectedFrom);
}
public void mergeFrom(Relation other) {
this.mergeOAFDataInfo(other);
if (other.getCollectedFrom() == null || other.getCollectedFrom().size() == 0)
return;
if (collectedFrom == null && other.getCollectedFrom() != null) {
collectedFrom = other.getCollectedFrom();
return;
}
if (other.getCollectedFrom() != null) {
collectedFrom.addAll(other.getCollectedFrom());
@Override
public int hashCode() {
return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom);
}
collectedFrom = new ArrayList<>(collectedFrom
.stream()
.collect(Collectors.toMap(KeyValue::toComparableString, x -> x, (x1, x2) -> x1))
.values());
}
}
}

@ -1,12 +1,10 @@
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.Comparator;
import java.util.List;
public abstract class Result extends OafEntity implements Serializable {
public class Result extends OafEntity implements Serializable {
private List<Author> author;
@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
// common fields
private Qualifier language;
private List<Qualifier> country;
private List<Country> country;
private List<StructuredProperty> subject;
@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
private List<Field<String>> coverage;
private Field<String> refereed; //peer-review status
private Qualifier bestaccessright;
private List<Context> context;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
private Field<String> processingchargeamount;
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
private Field<String> processingchargecurrency;
private List<ExternalReference> externalReference;
private List<Instance> instance;
@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
this.language = language;
}
public List<Qualifier> getCountry() {
public List<Country> getCountry() {
return country;
}
public void setCountry(List<Qualifier> country) {
public void setCountry(List<Country> country) {
this.country = country;
}
@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
this.coverage = coverage;
}
public Field<String> getRefereed() {
return refereed;
public Qualifier getBestaccessright() {
return bestaccessright;
}
public void setRefereed(Field<String> refereed) {
this.refereed = refereed;
public void setBestaccessright(Qualifier bestaccessright) {
this.bestaccessright = bestaccessright;
}
public List<Context> getContext() {
@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
this.instance = instance;
}
public Field<String> getProcessingchargeamount() {
return processingchargeamount;
}
public Result setProcessingchargeamount(Field<String> processingchargeamount) {
this.processingchargeamount = processingchargeamount;
return this;
}
public Field<String> getProcessingchargecurrency() {
return processingchargecurrency;
}
public Result setProcessingchargecurrency(Field<String> processingchargecurrency) {
this.processingchargecurrency = processingchargecurrency;
return this;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
coverage = mergeLists(coverage, r.getCoverage());
if (r.getRefereed() != null && compareTrust(this, r) < 0)
refereed = r.getRefereed();
context = mergeLists(context, r.getContext());
if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
processingchargeamount = r.getProcessingchargeamount();
if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
processingchargecurrency = r.getProcessingchargecurrency();
externalReference = mergeLists(externalReference, r.getExternalReference());
}
@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
return a.size() > b.size() ? a : b;
}
}

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import static org.junit.jupiter.api.Assertions.*;
/**
* @author claudio.atzori
*/
public class AtomicActionTest {
@Test
public void serializationTest() throws IOException {
Relation rel = new Relation();
rel.setSource("1");
rel.setTarget("2");
rel.setRelType("resultResult");
rel.setSubRelType("dedup");
rel.setRelClass("merges");
AtomicAction aa1 = new AtomicAction(Relation.class, rel);
final ObjectMapper mapper = new ObjectMapper();
String json = mapper.writeValueAsString(aa1);
assertTrue(StringUtils.isNotBlank(json));
AtomicAction aa2 = mapper.readValue(json, AtomicAction.class);
assertEquals(aa1.getClazz(), aa2.getClazz());
assertEquals(aa1.getPayload(), aa2.getPayload());
}
}

@ -1,11 +1,9 @@
package eu.dnetlib.dhp.schema.oaf;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -13,7 +11,7 @@ public class MergeTest {
OafEntity oaf;
@Before
@BeforeEach
public void setUp() {
oaf = new Publication();
}
@ -44,8 +42,8 @@ public class MergeTest {
a.mergeFrom(b);
Assert.assertNotNull(a.getCollectedfrom());
Assert.assertEquals(3, a.getCollectedfrom().size());
assertNotNull(a.getCollectedfrom());
assertEquals(3, a.getCollectedfrom().size());
}
@ -60,8 +58,8 @@ public class MergeTest {
a.mergeFrom(b);
Assert.assertNotNull(a.getSubject());
Assert.assertEquals(3, a.getSubject().size());
assertNotNull(a.getSubject());
assertEquals(3, a.getSubject().size());
}

@ -6,7 +6,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
@ -24,6 +24,61 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
@ -44,13 +99,22 @@
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>2.25.0</version>
<scope>test</scope>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.2.10</version>
</dependency>
</dependencies>

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.migration.actions;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import java.util.Comparator;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null && right == null) return 0;
if (left == null) return 1;
if (right == null) return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass)) return 0;
if (lClass.equals("OPEN SOURCE")) return -1;
if (rClass.equals("OPEN SOURCE")) return 1;
if (lClass.equals("OPEN")) return -1;
if (rClass.equals("OPEN")) return 1;
if (lClass.equals("6MONTHS")) return -1;
if (rClass.equals("6MONTHS")) return 1;
if (lClass.equals("12MONTHS")) return -1;
if (rClass.equals("12MONTHS")) return 1;
if (lClass.equals("EMBARGO")) return -1;
if (rClass.equals("EMBARGO")) return 1;
if (lClass.equals("RESTRICTED")) return -1;
if (rClass.equals("RESTRICTED")) return 1;
if (lClass.equals("CLOSED")) return -1;
if (rClass.equals("CLOSED")) return 1;
if (lClass.equals("UNKNOWN")) return -1;
if (rClass.equals("UNKNOWN")) return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

@ -0,0 +1,170 @@
package eu.dnetlib.dhp.migration.actions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.stream.Collectors;
public class MigrateActionSet {
private static final Log log = LogFactory.getLog(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
private static Boolean DEFAULT_TRANSFORM_ONLY = false;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
}
private void run(ArgumentApplicationParser parser) throws Exception {
final String isLookupUrl = parser.get("isLookupUrl");
final String sourceNN = parser.get("sourceNameNode");
final String targetNN = parser.get("targetNameNode");
final String workDir = parser.get("workingDirectory");
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
final String distcp_memory_mb = parser.get("distcp_memory_mb");
final String distcp_task_timeout = parser.get("distcp_task_timeout");
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: " + transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: " + transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
FileSystem targetFS = FileSystem.get(conf);
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
FileSystem sourceFS = FileSystem.get(sourceConf);
Properties props = new Properties();
List<Path> targetPaths = new ArrayList<>();
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))));
for(Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn(String.format("skipping unexisting path: %s", source));
} else {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info(String.format("got RAWSET: %s", rawSet));
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
final String actionSetDirectory = pathQ.pollLast();
final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
log.info(String.format("using TARGET PATH: %s", targetPath));
if (!transformOnly) {
if (targetFS.exists(targetPath)) {
targetFS.delete(targetPath, true);
}
runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
}
targetPaths.add(targetPath);
}
}
}
props.setProperty(TARGET_PATHS, targetPaths
.stream()
.map(p -> p.toString())
.collect(Collectors.joining(",")));
File file = new File(System.getProperty("oozie.action.output.properties"));
try(OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
System.out.println(file.getAbsolutePath());
}
private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
final DistCpOptions op = new DistCpOptions(source, targetPath);
op.setMaxMaps(distcp_num_maps);
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
int res = ToolRunner.run(new DistCp(conf, op), new String[]{
"-Dmapred.task.timeout=" + distcp_task_timeout,
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
"-pb",
"-m " + distcp_num_maps,
source.toString(),
targetPath.toString()});
if (res != 0) {
throw new RuntimeException(String.format("distcp exited with code %s", res));
}
}
private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
final Configuration conf = new Configuration();
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
conf.set("dfs.http.client.retry.policy.enabled", "true");
conf.set("mapred.task.timeout", distcp_task_timeout);
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
return conf;
}
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
String XQUERY = "distinct-values(\n" +
"let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
"for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
"let $setDir := $x//SET/@directory/string()\n" +
"let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
"return concat($basePath, '/', $setDir, '/', $rawSet))";
log.info(String.format("running xquery:\n%s", XQUERY));
return isLookUp.quickSearchProfile(XQUERY)
.stream()
.map(p -> sourceNN + p)
.map(Path::new)
.collect(Collectors.toList());
}
}

@ -0,0 +1,580 @@
package eu.dnetlib.dhp.migration.actions;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.*;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class ProtoConverter implements Serializable {
public static final String UNKNOWN = "UNKNOWN";
public static final String NOT_AVAILABLE = "not available";
public static final String DNET_ACCESS_MODES = "dnet:access_modes";
public static Oaf convert(OafProtos.Oaf oaf) {
try {
switch (oaf.getKind()) {
case entity:
return convertEntity(oaf);
case relation:
return convertRelation(oaf);
default:
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
}
} catch (Throwable e) {
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
}
}
private static Relation convertRelation(OafProtos.Oaf oaf) {
final OafProtos.OafRel r = oaf.getRel();
final Relation rel = new Relation();
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
rel.setSource(r.getSource());
rel.setTarget(r.getTarget());
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel.setCollectedFrom(r.getCollectedfromCount() > 0 ?
r.getCollectedfromList().stream()
.map(kv -> mapKV(kv))
.collect(Collectors.toList()) : null);
return rel;
}
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getType()) {
case result:
final Result r = convertResult(oaf);
r.setInstance(convertInstances(oaf));
return r;
case project:
return convertProject(oaf);
case datasource:
return convertDataSource(oaf);
case organization:
return convertOrganization(oaf);
default:
throw new RuntimeException("received unknown type");
}
}
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
final ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getInstanceCount() > 0) {
return r.getInstanceList()
.stream()
.map(i -> convertInstance(i))
.collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation());
i.setHostedby(mapKV(ri.getHostedby()));
i.setInstancetype(mapQualifier(ri.getInstancetype()));
i.setLicense(mapStringField(ri.getLicense()));
i.setUrl(ri.getUrlList());
i.setRefereed(mapStringField(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Organization convertOrganization(OafProtos.Oaf oaf) {
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
final Organization org = setOaf(new Organization(), oaf);
setEntity(org, oaf);
org.setLegalshortname(mapStringField(m.getLegalshortname()));
org.setLegalname(mapStringField(m.getLegalname()));
org.setAlternativeNames(m.getAlternativeNamesList().
stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
org.setLogourl(mapStringField(m.getLogourl()));
org.setEclegalbody(mapStringField(m.getEclegalbody()));
org.setEclegalperson(mapStringField(m.getEclegalperson()));
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
org.setEchighereducation(mapStringField(m.getEchighereducation()));
org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
org.setEcenterprise(mapStringField(m.getEcenterprise()));
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
org.setEcnutscode(mapStringField(m.getEcnutscode()));
org.setCountry(mapQualifier(m.getCountry()));
return org;
}
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
final Datasource datasource = setOaf(new Datasource(), oaf);
setEntity(datasource, oaf);
datasource.setAccessinfopackage(m.getAccessinfopackageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setCertificates(mapStringField(m.getCertificates()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setContactemail(mapStringField(m.getContactemail()));
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
datasource.setDescription(mapStringField(m.getDescription()));
datasource.setEnglishname(mapStringField(m.getEnglishname()));
datasource.setLatitude(mapStringField(m.getLatitude()));
datasource.setLongitude(mapStringField(m.getLongitude()));
datasource.setLogourl(mapStringField(m.getLogourl()));
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
datasource.setOdcontenttypes(m.getOdcontenttypesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdlanguages(m.getOdlanguagesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
datasource.setOfficialname(mapStringField(m.getOfficialname()));
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
datasource.setPidsystems(mapStringField(m.getPidsystems()));
datasource.setPolicies(m.getPoliciesList()
.stream()
.map(ProtoConverter::mapKV)
.collect(Collectors.toList()));
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
datasource.setSubjects(m.getSubjectsList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
datasource.setVersioning(mapBoolField(m.getVersioning()));
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
datasource.setJournal(mapJournal(m.getJournal()));
return datasource;
}
private static Project convertProject(OafProtos.Oaf oaf) {
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
final Project project = setOaf(new Project(), oaf);
setEntity(project, oaf);
project.setAcronym(mapStringField(m.getAcronym()));
project.setCallidentifier(mapStringField(m.getCallidentifier()));
project.setCode(mapStringField(m.getCode()));
project.setContactemail(mapStringField(m.getContactemail()));
project.setContactfax(mapStringField(m.getContactfax()));
project.setContactfullname(mapStringField(m.getContactfullname()));
project.setContactphone(mapStringField(m.getContactphone()));
project.setContracttype(mapQualifier(m.getContracttype()));
project.setCurrency(mapStringField(m.getCurrency()));
project.setDuration(mapStringField(m.getDuration()));
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
project.setEcsc39(mapStringField(m.getEcsc39()));
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
project.setStartdate(mapStringField(m.getStartdate()));
project.setEnddate(mapStringField(m.getEnddate()));
project.setFundedamount(m.getFundedamount());
project.setTotalcost(m.getTotalcost());
project.setKeywords(mapStringField(m.getKeywords()));
project.setSubjects(m.getSubjectsList().stream()
.map(sp -> mapStructuredProperty(sp))
.collect(Collectors.toList()));
project.setTitle(mapStringField(m.getTitle()));
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
project.setFundingtree(m.getFundingtreeList().stream()
.map(f -> mapStringField(f))
.collect(Collectors.toList()));
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
project.setSummary(mapStringField(m.getSummary()));
project.setOptional1(mapStringField(m.getOptional1()));
project.setOptional2(mapStringField(m.getOptional2()));
return project;
}
private static Result convertResult(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
case "dataset":
return createDataset(oaf);
case "publication":
return createPublication(oaf);
case "software":
return createSoftware(oaf);
case "other":
return createORP(oaf);
default:
Result result = setOaf(new Result(), oaf);
setEntity(result, oaf);
return setResult(result, oaf);
}
}
private static Software createSoftware(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Software software = setOaf(new Software(), oaf);
setEntity(software, oaf);
setResult(software, oaf);
software.setDocumentationUrl(m.getDocumentationUrlList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
software.setLicense(m.getLicenseList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
return software;
}
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
setEntity(otherResearchProducts, oaf);
setResult(otherResearchProducts, oaf);
otherResearchProducts.setContactperson(m.getContactpersonList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts.setContactgroup(m.getContactgroupList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts.setTool(m.getToolList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return otherResearchProducts;
}
private static Publication createPublication(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Publication publication = setOaf(new Publication(), oaf);
setEntity(publication, oaf);
setResult(publication, oaf);
publication.setJournal(mapJournal(m.getJournal()));
return publication;
}
private static Dataset createDataset(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Dataset dataset = setOaf(new Dataset(), oaf);
setEntity(dataset, oaf);
setResult(dataset, oaf);
dataset.setStoragedate(mapStringField(m.getStoragedate()));
dataset.setDevice(mapStringField(m.getDevice()));
dataset.setSize(mapStringField(m.getSize()));
dataset.setVersion(mapStringField(m.getVersion()));
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
dataset.setGeolocation(m.getGeolocationList()
.stream()
.map(ProtoConverter::mapGeolocation)
.collect(Collectors.toList()));
return dataset;
}
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
return oaf;
}
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
//setting Entity fields
final OafProtos.OafEntity e = oaf.getEntity();
entity.setId(e.getId());
entity.setOriginalId(e.getOriginalIdList());
entity.setCollectedfrom(e.getCollectedfromList()
.stream()
.map(ProtoConverter::mapKV)
.collect(Collectors.toList()));
entity.setPid(e.getPidList().stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDateofcollection(e.getDateofcollection());
entity.setDateoftransformation(e.getDateoftransformation());
entity.setExtraInfo(e.getExtraInfoList()
.stream()
.map(ProtoConverter::mapExtraInfo)
.collect(Collectors.toList()));
return entity;
}
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
//setting Entity fields
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
entity.setAuthor(m.getAuthorList()
.stream()
.map(ProtoConverter::mapAuthor)
.collect(Collectors.toList()));
entity.setResulttype(mapQualifier(m.getResulttype()));
entity.setLanguage(mapQualifier(m.getLanguage()));
entity.setCountry(m.getCountryList()
.stream()
.map(ProtoConverter::mapQualifierAsCountry)
.collect(Collectors.toList()));
entity.setSubject(m.getSubjectList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setTitle(m.getTitleList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setRelevantdate(m.getRelevantdateList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDescription(m.getDescriptionList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
entity.setPublisher(mapStringField(m.getPublisher()));
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
entity.setSource(m.getSourceList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setFulltext(m.getFulltextList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setFormat(m.getFormatList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setContributor(m.getContributorList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setResourcetype(mapQualifier(m.getResourcetype()));
entity.setCoverage(m.getCoverageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setContext(m.getContextList()
.stream()
.map(ProtoConverter::mapContext)
.collect(Collectors.toList()));
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
return entity;
}
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
if (instanceList != null) {
final Optional<FieldTypeProtos.Qualifier> min = instanceList.stream()
.map(i -> i.getAccessright()).min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private static Context mapContext(ResultProtos.Result.Context context) {
final Context entity = new Context();
entity.setId(context.getId());
entity.setDataInfo(context.getDataInfoList()
.stream()
.map(ProtoConverter::mapDataInfo)
.collect(Collectors.toList()));
return entity;
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
return keyValue;
}
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
dataInfo.setInferred(d.getInferred());
dataInfo.setInvisible(d.getInvisible());
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
dataInfo.setTrust(d.getTrust());
return dataInfo;
}
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(q.getClassid());
qualifier.setClassname(q.getClassname());
qualifier.setSchemeid(q.getSchemeid());
qualifier.setSchemename(q.getSchemename());
return qualifier;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country();
c.setClassid(q.getClassid());
c.setClassname(q.getClassname());
c.setSchemeid(q.getSchemeid());
c.setSchemename(q.getSchemename());
c.setDataInfo(mapDataInfo(q.getDataInfo()));
return c;
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
return structuredProperty;
}
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
final ExtraInfo entity = new ExtraInfo();
entity.setName(extraInfo.getName());
entity.setTypology(extraInfo.getTypology());
entity.setProvenance(extraInfo.getProvenance());
entity.setTrust(extraInfo.getTrust());
entity.setValue(extraInfo.getValue());
return entity;
}
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
final OAIProvenance entity = new OAIProvenance();
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
return entity;
}
public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
final OriginDescription originDescriptionResult = new OriginDescription();
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
originDescriptionResult.setAltered(originDescription.getAltered());
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
return originDescriptionResult;
}
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
final Field<String> stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
return stringField;
}
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
final Field<Boolean> booleanField = new Field<>();
booleanField.setValue(b.getValue());
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
return booleanField;
}
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
final Field<Integer> entity = new Field<>();
entity.setValue(b.getValue());
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
return entity;
}
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
journal.setConferenceplace(j.getConferenceplace());
journal.setEdition(j.getEdition());
journal.setEp(j.getEp());
journal.setIss(j.getIss());
journal.setIssnLinking(j.getIssnLinking());
journal.setIssnOnline(j.getIssnOnline());
journal.setIssnPrinted(j.getIssnPrinted());
journal.setName(j.getName());
journal.setSp(j.getSp());
journal.setVol(j.getVol());
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
return journal;
}
public static Author mapAuthor(FieldTypeProtos.Author author) {
final Author entity = new Author();
entity.setFullname(author.getFullname());
entity.setName(author.getName());
entity.setSurname(author.getSurname());
entity.setRank(author.getRank());
entity.setPid(author.getPidList()
.stream()
.map(kv -> {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(kv.getValue());
final Qualifier q = new Qualifier();
q.setClassid(kv.getKey());
q.setClassname(kv.getKey());
sp.setQualifier(q);
return sp;
})
.collect(Collectors.toList()));
entity.setAffiliation(author.getAffiliationList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return entity;
}
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
final GeoLocation entity = new GeoLocation();
entity.setPoint(geoLocation.getPoint());
entity.setBox(geoLocation.getBox());
entity.setPlace(geoLocation.getPlace());
return entity;
}
}

@ -0,0 +1,194 @@
package eu.dnetlib.dhp.migration.actions;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
public class TransformActions implements Serializable {
private static final Log log = LogFactory.getLog(TransformActions.class);
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
new TransformActions().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: " + isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: " + inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
try(SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info(String.format("found target directory '%s", targetDirectory));
fs.delete(targetDirectory, true);
log.info(String.format("deleted target directory '%s", targetDirectory));
}
log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
sc.sequenceFile(sourcePath, Text.class, Text.class)
.mapToPair(a -> new Tuple2<>(a._1(), eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())))
.mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
.filter(t -> StringUtils.isNotBlank(t._2().toString()))
.saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
}
}
private Text transformAction(String atomicaActionId, eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
final Text out = new Text();
final ObjectMapper mapper = new ObjectMapper();
if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
out.set(mapper.writeValueAsString(doTransform(aa)));
} else {
if (atomicaActionId.contains("dedupSimilarity")) {
out.set(mapper.writeValueAsString(getRelationAtomicAction(atomicaActionId)));
}
}
return out;
}
private AtomicAction<Relation> getRelationAtomicAction(String atomicaActionId) {
final String[] splitId = atomicaActionId.split("@");
String source = splitId[0];
String target = splitId[2];
String[] relSemantic = splitId[1].split("_");
Relation rel = new Relation();
rel.setSource(source);
rel.setTarget(target);
rel.setRelType(relSemantic[0]);
rel.setSubRelType(relSemantic[1]);
rel.setRelClass(relSemantic[2]);
DataInfo d = new DataInfo();
d.setDeletedbyinference(false);
d.setInferenceprovenance("deduplication");
d.setInferred(true);
d.setInvisible(false);
Qualifier provenanceaction = new Qualifier();
provenanceaction.setClassid("deduplication");
provenanceaction.setClassname("deduplication");
provenanceaction.setSchemeid("dnet:provenanceActions");
provenanceaction.setSchemename("dnet:provenanceActions");
d.setProvenanceaction(provenanceaction);
rel.setDataInfo(d);
return new AtomicAction<>(Relation.class, rel);
}
private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException {
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(TransformActions.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}

@ -0,0 +1,481 @@
package eu.dnetlib.dhp.migration.step1;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
import eu.dnetlib.dhp.migration.utils.DbClient;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
private final DbClient dbClient;
private final long lastUpdateTimestamp;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath");
final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) {
if (processClaims) {
log.info("Processing claims...");
smdbe.execute("queryClaims.sql", smdbe::processClaims);
} else {
log.info("Processing datasources...");
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
log.info("Processing projects...");
smdbe.execute("queryProjects.sql", smdbe::processProject);
log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
log.info("Processing relations ds <-> orgs ...");
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
log.info("Processing projects <-> orgs ...");
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
}
log.info("All done.");
}
}
protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST
super();
this.dbClient = null;
this.lastUpdateTimestamp = new Date().getTime();
}
public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime();
}
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) throws Exception {
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf));
dbClient.processResults(sql, consumer);
}
public List<Oaf> processDatasource(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>());
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
ds.setDateoftransformation(null); // Value not returned by the SQL query
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info));
ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
ds.setLogourl(field(rs.getString("logourl"), info));
ds.setContactemail(field(rs.getString("contactemail"), info));
ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
ds.setDescription(field(rs.getString("description"), info));
ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
ds.setVersioning(field(rs.getBoolean("versioning"), info));
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
ds.setPidsystems(field(rs.getString("pidsystems"), info));
ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(ds);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processProject(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Project p = new Project();
p.setId(createOpenaireId(40, rs.getString("projectid"), true));
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
p.setOaiprovenance(null); // Values not present in the DB
p.setWebsiteurl(field(rs.getString("websiteurl"), info));
p.setCode(field(rs.getString("code"), info));
p.setAcronym(field(rs.getString("acronym"), info));
p.setTitle(field(rs.getString("title"), info));
p.setStartdate(field(asString(rs.getDate("startdate")), info));
p.setEnddate(field(asString(rs.getDate("enddate")), info));
p.setCallidentifier(field(rs.getString("callidentifier"), info));
p.setKeywords(field(rs.getString("keywords"), info));
p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
p.setOptional1(field(rs.getString("optional1"), info));
p.setOptional2(field(rs.getString("optional2"), info));
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
p.setContactfullname(field(rs.getString("contactfullname"), info));
p.setContactfax(field(rs.getString("contactfax"), info));
p.setContactphone(field(rs.getString("contactphone"), info));
p.setContactemail(field(rs.getString("contactemail"), info));
p.setSummary(field(rs.getString("summary"), info));
p.setCurrency(field(rs.getString("currency"), info));
p.setTotalcost(new Float(rs.getDouble("totalcost")));
p.setFundedamount(new Float(rs.getDouble("fundedamount")));
p.setDataInfo(info);
p.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(p);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Organization o = new Organization();
o.setId(createOpenaireId(20, rs.getString("organizationid"), true));
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
o.setPid(new ArrayList<>());
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
o.setOaiprovenance(null); // Values not present in the DB
o.setLegalshortname(field(rs.getString("legalshortname"), info));
o.setLegalname(field(rs.getString("legalname"), info));
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
o.setWebsiteurl(field(rs.getString("websiteurl"), info));
o.setLogourl(field(rs.getString("logourl"), info));
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
o.setDataInfo(info);
o.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(o);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization"), true);
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("datasourceOrganization");
r1.setSubRelType("provision");
r1.setRelClass("isProvidedBy");
r1.setSource(dsId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation();
r2.setRelType("datasourceOrganization");
r2.setSubRelType("provision");
r2.setRelClass("provides");
r2.setSource(orgId);
r2.setTarget(dsId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processProjectOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
final String projectId = createOpenaireId(40, rs.getString("project"), true);
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("projectOrganization");
r1.setSubRelType("participation");
r1.setRelClass("isParticipant");
r1.setSource(projectId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation();
r2.setRelType("projectOrganization");
r2.setSubRelType("participation");
r2.setRelClass("hasParticipant");
r2.setSource(orgId);
r2.setTarget(projectId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public List<Oaf> processClaims(final ResultSet rs) {
final DataInfo info =
dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
try {
if (rs.getString("source_type").equals("context")) {
final Result r;
if (rs.getString("target_type").equals("dataset")) {
r = new Dataset();
} else if (rs.getString("target_type").equals("software")) {
r = new Software();
} else if (rs.getString("target_type").equals("other")) {
r = new OtherResearchProduct();
} else {
r = new Publication();
}
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info));
r.setDataInfo(info);
return Arrays.asList(r);
} else {
final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
final Relation r1 = new Relation();
final Relation r2 = new Relation();
if (rs.getString("source_type").equals("project")) {
r1.setRelType("resultProject");
r1.setSubRelType("outcome");
r1.setRelClass("produces");
r2.setRelType("resultProject");
r2.setSubRelType("outcome");
r2.setRelClass("isProducedBy");
} else {
r1.setRelType("resultResult");
r1.setSubRelType("relationship");
r1.setRelClass("isRelatedTo");
r2.setRelType("resultResult");
r2.setSubRelType("relationship");
r2.setRelClass("isRelatedTo");
}
r1.setSource(sourceId);
r1.setTarget(targetId);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
r2.setSource(targetId);
r2.setTarget(sourceId);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2);
}
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
final Context context = new Context();
context.setId(id);
context.setDataInfo(Arrays.asList(dataInfo));
return Arrays.asList(context);
}
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
final String inferenceprovenance = rs.getString("inferenceprovenance");
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
}
private Qualifier prepareQualifierSplitting(final String s) {
if (StringUtils.isBlank(s)) { return null; }
final String[] arr = s.split("@@@");
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
}
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
try {
return array != null ? listFields(info, (String[]) array.getArray()) : new ArrayList<>();
} catch (final SQLException e) {
throw new RuntimeException("Invalid SQL array", e);
}
}
private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
if (StringUtils.isBlank(s)) { return null; }
final String[] parts = s.split("###");
if (parts.length == 2) {
final String value = parts[0];
final String[] arr = parts[1].split("@@@");
if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
}
return null;
}
private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
final List<StructuredProperty> res = new ArrayList<>();
if (array != null) {
for (final String s : (String[]) array.getArray()) {
final StructuredProperty sp = prepareStructProp(s, dataInfo);
if (sp != null) {
res.add(sp);
}
}
}
return res;
}
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
if (StringUtils.isNotBlank(sj)) {
final String[] arr = sj.split("@@@");
if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
if (issn != null || eissn != null
|| lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); }
}
}
return null;
}
@Override
public void close() throws IOException {
super.close();
dbClient.close();
}
}

@ -0,0 +1,67 @@
package eu.dnetlib.dhp.migration.step1;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
import eu.dnetlib.dhp.migration.utils.MdstoreClient;
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
private final MdstoreClient mdstoreClient;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl");
final String mongoDb = parser.get("mongoDb");
final String mdFormat = parser.get("mdFormat");
final String mdLayout = parser.get("mdLayout");
final String mdInterpretation = parser.get("mdInterpretation");
final String hdfsPath = parser.get("hdfsPath");
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
app.execute(mdFormat, mdLayout, mdInterpretation);
}
}
public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
super(hdfsPath);
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
}
public void execute(final String format, final String layout, final String interpretation) {
final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
log.info("Found " + colls.size() + " mdstores");
for (final Entry<String, String> entry : colls.entrySet()) {
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) {
emit(xml, "native_" + format);
}
}
}
@Override
public void close() throws IOException {
super.close();
mdstoreClient.close();
}
}

@ -0,0 +1,394 @@
package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public abstract class AbstractMdRecordToOafMapper {
protected final Map<String, String> code2name;
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
this.code2name = code2name;
}
public List<Oaf> processMdRecord(final String xml) {
try {
final Map<String, String> nsContext = new HashMap<>();
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
final Document doc = DocumentHelper.parseText(xml);
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
final DataInfo info = prepareDataInfo(doc);
final long lastUpdateTimestamp = new Date().getTime();
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
protected List<Oaf> createOafs(final Document doc,
final String type,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> oafs = new ArrayList<>();
switch (type.toLowerCase()) {
case "":
case "publication":
final Publication p = new Publication();
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
p.setJournal(prepareJournal(doc, info));
oafs.add(p);
break;
case "dataset":
final Dataset d = new Dataset();
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info));
d.setSize(prepareDatasetSize(doc, info));
d.setVersion(prepareDatasetVersion(doc, info));
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
oafs.add(d);
break;
case "software":
final Software s = new Software();
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info));
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
oafs.add(s);
break;
case "otherresearchproducts":
default:
final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
o.setTool(prepareOtherResearchProductTools(doc, info));
oafs.add(o);
break;
}
if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
}
return oafs;
}
private List<Oaf> addProjectRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> res = new ArrayList<>();
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
for (final Object o : doc.selectNodes("//oaf:projectid")) {
final String projectId = createOpenaireId(40, ((Node) o).getText(), true);
final Relation r1 = new Relation();
r1.setRelType("resultProject");
r1.setSubRelType("outcome");
r1.setRelClass("isProducedBy");
r1.setSource(docId);
r1.setTarget(projectId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultProject");
r2.setSubRelType("outcome");
r2.setRelClass("produces");
r2.setSource(projectId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
protected abstract List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp);
private void populateResultFields(final Result r,
final Document doc,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info));
r.setLanguage(prepareLanguages(doc));
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setSubject(prepareSubjects(doc, info));
r.setTitle(prepareTitles(doc, info));
r.setRelevantdate(prepareRelevantDates(doc, info));
r.setDescription(prepareDescriptions(doc, info));
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info));
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info));
r.setCoverage(prepareCoverages(doc, info));
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
}
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc);
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) {
final String name = n.getText();
final String issnPrinted = n.valueOf("@issn");
final String issnOnline = n.valueOf("@eissn");
final String issnLinking = n.valueOf("@lissn");
final String ep = n.valueOf("@ep");
final String iss = n.valueOf("@iss");
final String sp = n.valueOf("@sp");
final String vol = n.valueOf("@vol");
final String edition = n.valueOf("@edition");
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
}
return null;
}
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
final String classId = node.valueOf(xpath);
final String className = code2name.get(classId);
return qualifier(classId, className, schemeId, schemeName);
}
protected List<StructuredProperty> prepareListStructProps(final Node node,
final String xpath,
final String xpathClassId,
final String schemeId,
final String schemeName,
final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId);
final String className = code2name.get(classId);
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), qualifier, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
if (n == null) { return null; }
final String identifier = n.valueOf("./*[local-name()='identifier']");
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
final String harvestDate = n.valueOf("@harvestDate");;
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
}
protected DataInfo prepareDataInfo(final Document doc) {
final Node n = doc.selectSingleNode("//oaf:datainfo");
if (n == null) { return null; }
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
final String trust = n.valueOf("./oaf:trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
}
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
return field(node.valueOf(xpath), info);
}
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
return listFields(info, prepareListString(node, xpath));
}
protected List<String> prepareListString(final Node node, final String xpath) {
final List<String> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final String s = ((Node) o).getText().trim();
if (StringUtils.isNotBlank(s)) {
res.add(s);
}
}
return res;
}
}

@ -0,0 +1,173 @@
package eu.dnetlib.dhp.migration.step2;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
import eu.dnetlib.dhp.migration.utils.DbClient;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import scala.Tuple2;
public class GenerateEntitiesApplication {
private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
parser.parseArgument(args);
final String sourcePaths = parser.get("sourcePaths");
final String targetPath = parser.get("targetPath");
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
generateEntities(sc, code2name, existingSourcePaths, targetPath);
}
}
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
return SparkSession
.builder()
.appName(GenerateEntitiesApplication.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
}
private static void generateEntities(final JavaSparkContext sc,
final Map<String, String> code2name,
final List<String> sourcePaths,
final String targetPath) {
log.info("Generate entities from files:");
sourcePaths.forEach(log::info);
JavaRDD<String> inputRdd = sc.emptyRDD();
for (final String sp : sourcePaths) {
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
.flatMap(list -> list.iterator())
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
}
inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
}
private static List<Oaf> convertToListOaf(final String id, final String s, final Map<String, String> code2name) {
final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) {
case "native_oaf":
return new OafToOafMapper(code2name).processMdRecord(s);
case "native_odf":
return new OdfToOafMapper(code2name).processMdRecord(s);
case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization":
return Arrays.asList(convertFromJson(s, Organization.class));
case "project":
return Arrays.asList(convertFromJson(s, Project.class));
case "relation":
return Arrays.asList(convertFromJson(s, Relation.class));
case "publication":
return Arrays.asList(convertFromJson(s, Publication.class));
case "dataset":
return Arrays.asList(convertFromJson(s, Dataset.class));
case "software":
return Arrays.asList(convertFromJson(s, Software.class));
case "otherresearchproducts":
default:
return Arrays.asList(convertFromJson(s, OtherResearchProduct.class));
}
}
private static Map<String, String> loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
log.info("Loading vocabulary terms from db...");
final Map<String, String> map = new HashMap<>();
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
dbClient.processResults("select code, name from class", rs -> {
try {
map.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
}
log.info("Found " + map.size() + " terms.");
return map;
}
private static String convertToJson(final Oaf oaf) {
try {
return new ObjectMapper().writeValueAsString(oaf);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
try {
return new ObjectMapper().readValue(s, clazz);
} catch (final Exception e) {
log.error("Error parsing object of class: " + clazz);
log.error(s);
throw new RuntimeException(e);
}
}
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
try {
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
final Path path = new Path(pathToFile);
return hdfs.exists(path);
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
}

@ -0,0 +1,242 @@
package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.migration.utils.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final Map<String, String> code2name) {
super(code2name);
}
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.getText());
author.setRank(pos++);
final PacePerson p = new PacePerson(n.getText(), false);
if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname());
}
res.add(author);
}
return res;
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info);
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor", info);
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:coverage", info);
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:identifier")) {
final String url = ((Node) o).getText().trim();
if (url.startsWith("http")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(url));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:source", info);
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// SOFTWARES
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
// OTHER PRODUCTS
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
final Relation r1 = new Relation();
r1.setRelType("resultResult");
r1.setSubRelType("publicationDataset");
r1.setRelClass("isRelatedTo");
r1.setSource(docId);
r1.setTarget(otherId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultResult");
r2.setSubRelType("publicationDataset");
r2.setRelClass("isRelatedTo");
r2.setSource(otherId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
}

@ -0,0 +1,265 @@
package eu.dnetlib.dhp.migration.step2;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public OdfToOafMapper(final Map<String, String> code2name) {
super(code2name);
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//datacite:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.valueOf("./datacite:creatorName"));
author.setName(n.valueOf("./datacite:givenName"));
author.setSurname(n.valueOf("./datacite:familyName"));
author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info));
author.setPid(preparePids(doc, info));
author.setRank(pos++);
res.add(author);
}
return res;
}
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) {
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
}
return res;
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//datacite:date")) {
final String dateType = ((Node) o).valueOf("@dateType");
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
}
}
return res;
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:contributorName", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:format", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//datacite:publisher", info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//datacite:subject", info);
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
}
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
final List<GeoLocation> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
final GeoLocation loc = new GeoLocation();
loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox"));
loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace"));
loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint"));
res.add(loc);
}
return res;
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return prepareField(doc, "//datacite:version", info);
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return prepareField(doc, "//datacite:size", info);
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
final String type = ((Node) o).valueOf("@relationType");
if (type.equals("IsSupplementTo")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
} else if (type.equals("IsPartOf")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
} else {}
}
return res;
}
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp,
final String source,
final String target,
final String subRelType,
final String relClass) {
final Relation r = new Relation();
r.setRelType("resultResult");
r.setSubRelType(subRelType);
r.setRelClass(relClass);
r.setSource(source);
r.setTarget(target);
r.setCollectedFrom(Arrays.asList(collectedFrom));
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
return r;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
}
}

@ -0,0 +1,71 @@
package eu.dnetlib.dhp.migration.step3;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
public class DispatchEntitiesApplication {
private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
parser.parseArgument(args);
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
final String sourcePath = parser.get("sourcePath");
final String targetPath = parser.get("graphRawPath");
processEntity(sc, Publication.class, sourcePath, targetPath);
processEntity(sc, Dataset.class, sourcePath, targetPath);
processEntity(sc, Software.class, sourcePath, targetPath);
processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
processEntity(sc, Datasource.class, sourcePath, targetPath);
processEntity(sc, Organization.class, sourcePath, targetPath);
processEntity(sc, Project.class, sourcePath, targetPath);
processEntity(sc, Relation.class, sourcePath, targetPath);
}
}
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
return SparkSession
.builder()
.appName(DispatchEntitiesApplication.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
}
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
final String type = clazz.getSimpleName().toLowerCase();
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
sc.textFile(sourcePath)
.filter(l -> isEntityType(l, type))
.map(l -> StringUtils.substringAfter(l, "|"))
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
}
private static boolean isEntityType(final String line, final String type) {
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
}
}

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.map.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Oaf;
public class AbstractMigrationApplication implements Closeable {
private final AtomicInteger counter = new AtomicInteger(0);
private final Text key = new Text();
private final Text value = new Text();
private final SequenceFile.Writer writer;
private final ObjectMapper objectMapper = new ObjectMapper();
private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST
this.writer = null;
}
public AbstractMigrationApplication(final String hdfsPath) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));
this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
}
private Configuration getConf() throws IOException {
final Configuration conf = new Configuration();
/*
* conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
* conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser);
* System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf);
*/
return conf;
}
protected void emit(final String s, final String type) {
try {
key.set(counter.getAndIncrement() + ":" + type);
value.set(s);
writer.append(key, value);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
protected void emitOaf(final Oaf oaf) {
try {
emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase());
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public ObjectMapper getObjectMapper() {
return objectMapper;
}
@Override
public void close() throws IOException {
writer.hflush();
writer.close();
}
}

@ -0,0 +1,65 @@
package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
public DbClient(final String address, final String login, final String password) {
try {
Class.forName("org.postgresql.Driver");
this.connection =
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
this.connection.setAutoCommit(false);
} catch (final Exception e) {
log.error("Connection to postgresDB failed");
throw new RuntimeException("Connection to postgresDB failed", e);
}
log.info("Opened database successfully");
}
public void processResults(final String sql, final Consumer<ResultSet> consumer) {
try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100);
try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
consumer.accept(rs);
}
} catch (final SQLException e) {
log.error("Error executing sql query: " + sql, e);
throw new RuntimeException("Error executing sql query", e);
}
} catch (final SQLException e1) {
log.error("Error preparing sql statement", e1);
throw new RuntimeException("Error preparing sql statement", e1);
}
}
@Override
public void close() throws IOException {
try {
connection.close();
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
}

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.migration.utils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
import com.google.common.collect.Iterables;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
private static final Log log = LogFactory.getLog(MdstoreClient.class);
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
final Map<String, String> transactions = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
final String mdId = entry.getString("mdId");
final String currentId = entry.getString("currentId");
if (StringUtils.isNoneBlank(mdId, currentId)) {
transactions.put(mdId, currentId);
}
}
final Map<String, String> res = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
&& entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
}
}
return res;
}
private MongoDatabase getDb(final MongoClient client, final String dbName) {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new RuntimeException(err);
}
return client.getDatabase(dbName);
}
private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
if (!Iterables.contains(db.listCollectionNames(), collName)) {
final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err);
if (abortIfMissing) {
throw new RuntimeException(err);
} else {
return null;
}
}
return db.getCollection(collName);
}
public Iterable<String> listRecords(final String collName) {
final MongoCollection<Document> coll = getColl(db, collName, false);
return coll == null ? new ArrayList<>()
: () -> StreamSupport.stream(coll.find().spliterator(), false)
.filter(e -> e.containsKey("body"))
.map(e -> e.getString("body"))
.iterator();
}
@Override
public void close() throws IOException {
client.close();
}
}

@ -0,0 +1,195 @@
package eu.dnetlib.dhp.migration.utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) { return null; }
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static StructuredProperty structuredProperty(final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
if (value == null) { return null; }
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
public static DataInfo dataInfo(final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) {
if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
}

@ -0,0 +1,176 @@
package eu.dnetlib.dhp.migration.utils;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
public PacePerson(String s, final boolean aggressive) {
original = s;
s = Normalizer.normalize(s, Normalizer.Form.NFD);
s = s.replaceAll("\\(.+\\)", "");
s = s.replaceAll("\\[.+\\]", "");
s = s.replaceAll("\\{.+\\}", "");
s = s.replaceAll("\\s+-\\s+", "-");
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
s = s.replaceAll("\\d", " ");
s = s.replaceAll("\\n", " ");
s = s.replaceAll("\\.", " ");
s = s.replaceAll("\\s+", " ");
if (aggressive) {
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);
} else if (arr.length > 1) {
surname = splitTerms(arr[0]);
name = splitTerms(arr[1]);
fullname.addAll(surname);
fullname.addAll(name);
}
} else {
fullname = splitTerms(s);
int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false;
for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i);
if (term.length() == 1) {
lastInitialPosition = i;
} else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true;
}
}
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
for (final String term : fullname) {
if (term.length() > 1 && term.equals(term.toUpperCase())) {
surname.add(term);
} else {
name.add(term);
}
}
}
}
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
list.add(part);
}
}
return list;
}
public List<String> getName() {
return name;
}
public String getNameString() {
return Joiner.on(" ").join(getName());
}
public List<String> getSurname() {
return surname;
}
public List<String> getFullname() {
return fullname;
}
public String getOriginal() {
return original;
}
public String hash() {
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
}
public String getNormalisedFirstName() {
return Joiner.on(" ").join(getCapitalFirstnames());
}
public String getNormalisedSurname() {
return Joiner.on(" ").join(getCapitalSurname());
}
public String getSurnameString() {
return Joiner.on(" ").join(getSurname());
}
public String getNormalisedFullname() {
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
}
public List<String> getCapitalFirstnames() {
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
}
public boolean isAccurate() {
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
}
}

@ -0,0 +1,20 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path",
"paramRequired": true
},
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "g",
"paramLongName": "graphRawPath",
"paramDescription": "the path of the graph Raw in hdfs",
"paramRequired": true
}
]

@ -0,0 +1,39 @@
[
{
"paramName": "s",
"paramLongName": "sourcePaths",
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
"paramRequired": true
},
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the path of the target file",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

@ -0,0 +1,10 @@
[
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
]

@ -0,0 +1,32 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
},
{
"paramName": "a",
"paramLongName": "action",
"paramDescription": "process claims",
"paramRequired": false
}
]

@ -0,0 +1,38 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "mongourl",
"paramLongName": "mongoBaseUrl",
"paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
"paramRequired": true
},
{
"paramName": "mongodb",
"paramLongName": "mongoDb",
"paramDescription": "mongo database",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "mdFormat",
"paramDescription": "metadata format",
"paramRequired": true
},
{
"paramName": "l",
"paramLongName": "mdLayout",
"paramDescription": "metadata layout",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "mdInterpretation",
"paramDescription": "metadata interpretation",
"paramRequired": true
}
]

@ -0,0 +1 @@
SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;

@ -0,0 +1,17 @@
SELECT
dor.datasource AS datasource,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
FROM dsm_datasource_organization dor
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)

@ -0,0 +1,147 @@
SELECT
d.id AS datasourceid,
d.id || array_agg(distinct di.pid) AS identities,
d.officialname AS officialname,
d.englishname AS englishname,
d.contactemail AS contactemail,
CASE
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
THEN
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
THEN
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
THEN
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
THEN
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
THEN
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
THEN
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
THEN
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
THEN
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
THEN
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
ELSE
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
END AS openairecompatibility,
d.websiteurl AS websiteurl,
d.logourl AS logourl,
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
d.latitude AS latitude,
d.longitude AS longitude,
d.namespaceprefix AS namespaceprefix,
NULL AS odnumberofitems,
NULL AS odnumberofitemsdate,
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
FROM UNNEST(
ARRAY(
SELECT trim(s)
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
d.description AS description,
NULL AS odpolicies,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
d.dateofcollection AS dateofcollection,
d.dateofvalidation AS dateofvalidation,
-- re3data fields
d.releasestartdate AS releasestartdate,
d.releaseenddate AS releaseenddate,
d.missionstatementurl AS missionstatementurl,
d.dataprovider AS dataprovider,
d.serviceprovider AS serviceprovider,
d.databaseaccesstype AS databaseaccesstype,
d.datauploadtype AS datauploadtype,
d.databaseaccessrestriction AS databaseaccessrestriction,
d.datauploadrestriction AS datauploadrestriction,
d.versioning AS versioning,
d.citationguidelineurl AS citationguidelineurl,
d.qualitymanagementkind AS qualitymanagementkind,
d.pidsystems AS pidsystems,
d.certificates AS certificates,
ARRAY[]::text[] AS policies,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology || '@@@' || CASE
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
WHEN (d.typology = 'infospace') THEN 'Information Space'
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
WHEN (d.typology = 'entityregistry') THEN 'Registry'
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
WHEN (d.typology = 'websource') THEN 'Web Source'
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
WHEN (d.typology = 'orprepository') THEN 'Repository'
ELSE 'Other'
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
GROUP BY
d.id,
d.officialname,
d.englishname,
d.websiteurl,
d.logourl,
d.contactemail,
d.namespaceprefix,
d.description,
d.latitude,
d.longitude,
d.dateofcollection,
d.dateofvalidation,
d.releasestartdate,
d.releaseenddate,
d.missionstatementurl,
d.dataprovider,
d.serviceprovider,
d.databaseaccesstype,
d.datauploadtype,
d.databaseaccessrestriction,
d.datauploadrestriction,
d.versioning,
d.citationguidelineurl,
d.qualitymanagementkind,
d.pidsystems,
d.certificates,
dc.id,
dc.officialname,
d.issn,
d.eissn,
d.lissn

@ -0,0 +1,35 @@
SELECT
o.id AS organizationid,
o.legalshortname AS legalshortname,
o.legalname AS legalname,
o.websiteurl AS websiteurl,
o.logourl AS logourl,
o.ec_legalbody AS eclegalbody,
o.ec_legalperson AS eclegalperson,
o.ec_nonprofit AS ecnonprofit,
o.ec_researchorganization AS ecresearchorganization,
o.ec_highereducation AS echighereducation,
o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
o.ec_internationalorganization AS ecinternationalorganization,
o.ec_enterprise AS ecenterprise,
o.ec_smevalidated AS ecsmevalidated,
o.ec_nutscode AS ecnutscode,
o.dateofcollection AS dateofcollection,
o.lastupdate AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
o.trust AS trust,
'' AS inferenceprovenance,
d.id AS collectedfromid,
d.officialname AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
ARRAY[]::text[] AS pid
FROM dsm_organizations o
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)

@ -0,0 +1,53 @@
SELECT
o.id AS organizationid,
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
o.name AS legalname,
array_agg(DISTINCT n.name) AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.95 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
LEFT OUTER JOIN other_names n ON (n.id = o.id)
GROUP BY
o.id,
o.name,
o.modification_date,
o.country
UNION ALL
SELECT
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
n.name AS legalshortname,
n.name AS legalname,
ARRAY[]::text[] AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.88 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
GROUP BY
o.id, o.modification_date, o.country, n.name

@ -0,0 +1,19 @@
SELECT
po.project AS project,
po.resporganization AS resporganization,
po.participantnumber AS participantnumber,
po.contribution AS contribution,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
po.trust AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
FROM project_organization po
LEFT OUTER JOIN projects p ON (p.id = po.project)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)

@ -0,0 +1,89 @@
SELECT
p.id AS projectid,
p.code AS code,
p.websiteurl AS websiteurl,
p.acronym AS acronym,
p.title AS title,
p.startdate AS startdate,
p.enddate AS enddate,
p.call_identifier AS callidentifier,
p.keywords AS keywords,
p.duration AS duration,
p.ec_sc39 AS ecsc39,
p.oa_mandate_for_publications AS oamandatepublications,
p.ec_article29_3 AS ecarticle29_3,
p.dateofcollection AS dateofcollection,
p.lastupdate AS dateoftransformation,
p.inferred AS inferred,
p.deletedbyinference AS deletedbyinference,
p.trust AS trust,
p.inferenceprovenance AS inferenceprovenance,
p.optional1 AS optional1,
p.optional2 AS optional2,
p.jsonextrainfo AS jsonextrainfo,
p.contactfullname AS contactfullname,
p.contactfax AS contactfax,
p.contactphone AS contactphone,
p.contactemail AS contactemail,
p.summary AS summary,
p.currency AS currency,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
GROUP BY
p.id,
p.code,
p.websiteurl,
p.acronym,
p.title,
p.startdate,
p.enddate,
p.call_identifier,
p.keywords,
p.duration,
p.ec_sc39,
p.oa_mandate_for_publications,
p.ec_article29_3,
p.dateofcollection,
p.inferred,
p.deletedbyinference,
p.trust,
p.inferenceprovenance,
p.contactfullname,
p.contactfax,
p.contactphone,
p.contactemail,
p.summary,
p.currency,
p.totalcost,
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
p.contracttype , p.contracttypename, p.contracttypescheme;

@ -0,0 +1,90 @@
SELECT
p.id AS projectid,
p.code AS code,
p.websiteurl AS websiteurl,
p.acronym AS acronym,
p.title AS title,
p.startdate AS startdate,
p.enddate AS enddate,
p.call_identifier AS callidentifier,
p.keywords AS keywords,
p.duration AS duration,
p.ec_sc39 AS ecsc39,
p.oa_mandate_for_publications AS oamandatepublications,
p.ec_article29_3 AS ecarticle29_3,
p.dateofcollection AS dateofcollection,
p.lastupdate AS dateoftransformation,
p.inferred AS inferred,
p.deletedbyinference AS deletedbyinference,
p.trust AS trust,
p.inferenceprovenance AS inferenceprovenance,
p.optional1 AS optional1,
p.optional2 AS optional2,
p.jsonextrainfo AS jsonextrainfo,
p.contactfullname AS contactfullname,
p.contactfax AS contactfax,
p.contactphone AS contactphone,
p.contactemail AS contactemail,
p.summary AS summary,
p.currency AS currency,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
GROUP BY
p.id,
p.code,
p.websiteurl,
p.acronym,
p.title,
p.startdate,
p.enddate,
p.call_identifier,
p.keywords,
p.duration,
p.ec_sc39,
p.oa_mandate_for_publications,
p.ec_article29_3,
p.dateofcollection,
p.inferred,
p.deletedbyinference,
p.trust,
p.inferenceprovenance,
p.contactfullname,
p.contactfax,
p.contactphone,
p.contactemail,
p.summary,
p.currency,
p.totalcost,
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
ctc.code, ctc.name, cts.code, cts.name;

@ -0,0 +1,17 @@
SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
FROM acronyms a
LEFT OUTER JOIN organizations o ON (a.id = o.id)
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)

@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}
]

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>sourceNN</name>
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/applicationHistory</value>
</property>
</configuration>

@ -0,0 +1,122 @@
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
<parameters>
<property>
<name>sourceNN</name>
<description>the source name node</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>workingDirectory</name>
<value>/tmp/actionsets</value>
<description>working directory</description>
</property>
<property>
<name>distcp_memory_mb</name>
<value>6144</value>
<description>memory for distcp copying actionsets from remote cluster</description>
</property>
<property>
<name>distcp_task_timeout</name>
<value>60000000</value>
<description>timeout for distcp copying actions from remote cluster</description>
</property>
<property>
<name>distcp_num_maps</name>
<value>1</value>
<description>mmaximum number of map tasks used in the distcp process</description>
</property>
<property>
<name>transform_only</name>
<description>activate tranform-only mode. Only apply transformation step</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to='migrate_actionsets' />
<action name='migrate_actionsets'>
<java>
<main-class>eu.dnetlib.dhp.migration.actions.MigrateActionSet</main-class>
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>-sn</arg><arg>${sourceNN}</arg>
<arg>-tn</arg><arg>${nameNode}</arg>
<arg>-w</arg><arg>${workingDirectory}</arg>
<arg>-nm</arg><arg>${distcp_num_maps}</arg>
<arg>-mm</arg><arg>${distcp_memory_mb}</arg>
<arg>-tt</arg><arg>${distcp_task_timeout}</arg>
<arg>-tr</arg><arg>${transform_only}</arg>
<capture-output/>
</java>
<ok to="transform_actions" />
<error to="fail" />
</action>
<action name="transform_actions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>transform_actions</name>
<class>eu.dnetlib.dhp.migration.actions.TransformActions</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores ${sparkExecutorCores}
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
</spark>
<ok to="end"/>
<error to="fail"/>
</action>
<kill name="fail">
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<end name="end" />
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,169 @@
<workflow-app name="import Claims as Graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationClaimsPathStep1</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>migrationClaimsPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>migrationClaimsPathStep3</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationClaimsPathStep1}'/>
<mkdir path='${migrationClaimsPathStep1}'/>
</fs>
<ok to="ImportDBClaims"/>
<error to="Kill"/>
</action>
<action name="ImportDBClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/db_claims</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
<arg>-a</arg><arg>claims</arg>
</java>
<ok to="ImportODFClaims"/>
<error to="Kill"/>
</action>
<action name="ImportODFClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/odf_claims</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
</java>
<ok to="ImportOAFClaims"/>
<error to="Kill"/>
</action>
<action name="ImportOAFClaims">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationClaimsPathStep1}/oaf_claims</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>claim</arg>
</java>
<ok to="ResetClaimEntities"/>
<error to="Kill"/>
</action>
<action name="ResetClaimEntities">
<fs>
<delete path='${migrationClaimsPathStep2}'/>
<mkdir path='${migrationClaimsPathStep2}'/>
</fs>
<ok to="GenerateClaimEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateClaimEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateClaimEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims</arg>
<arg>-t</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="ResetClaimGraph"/>
<error to="Kill"/>
</action>
<action name="ResetClaimGraph">
<fs>
<delete path='${migrationClaimsPathStep3}'/>
<mkdir path='${migrationClaimsPathStep3}'/>
</fs>
<ok to="GenerateClaimGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateClaimGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateClaimGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
<arg>-g</arg><arg>${migrationClaimsPathStep3}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,197 @@
<workflow-app name="import regular entities as Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<value>/tmp/dhp_migration</value>
<description>the base path to store temporary intermediate data</description>
</property>
<property>
<name>graphBasePath</name>
<description>the target path to store raw graph</description>
</property>
<property>
<name>reuseContent</name>
<value>false</value>
<description>should import content from the aggregator or reuse a previous version</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="ReuseContent"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="ReuseContent">
<switch>
<case to="ResetWorkingPath">${wf:conf('reuseContent') eq false}</case>
<case to="ResetAllEntitiesPath">${wf:conf('reuseContent') eq true}</case>
<default to="ResetWorkingPath"/>
</switch>
</decision>
<action name="ResetWorkingPath">
<fs>
<delete path="${workingPath}"/>
<mkdir path="${workingPath}"/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>
</action>
<action name="ImportODF">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ImportOAF"/>
<error to="Kill"/>
</action>
<action name="ImportOAF">
<java>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ResetAllEntitiesPath"/>
<error to="Kill"/>
</action>
<action name="ResetAllEntitiesPath">
<fs>
<delete path="${workingPath}/all_entities"/>
</fs>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records</arg>
<arg>-t</arg><arg>${workingPath}/all_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="ResetGraphPath"/>
<error to="Kill"/>
</action>
<action name="ResetGraphPath">
<fs>
<delete path="${graphBasePath}/graph_raw"/>
<mkdir path="${graphBasePath}/graph_raw"/>
</fs>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${workingPath}/all_entities</arg>
<arg>-g</arg><arg>${graphBasePath}/graph_raw</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,103 @@
<workflow-app name="import regular entities as Graph (step 1)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongoURL</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationPathStep1}'/>
<mkdir path='${migrationPathStep1}'/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>
</action>
<action name="ImportODF">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="ImportOAF"/>
<error to="Kill"/>
</action>
<action name="ImportOAF">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
<arg>-mongourl</arg><arg>${mongoURL}</arg>
<arg>-mongodb</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,62 @@
<workflow-app name="import db entities (step 1)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationPathStep1}/db_records'/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,74 @@
<workflow-app name="import regular entities as Graph (step 2)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>migrationPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetEntities"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetEntities">
<fs>
<delete path='${migrationPathStep2}'/>
<mkdir path='${migrationPathStep2}'/>
</fs>
<ok to="GenerateEntities"/>
<error to="Kill"/>
</action>
<action name="GenerateEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateEntities</name>
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,60 @@
<workflow-app name="import regular entities as Graph (step 3)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep2</name>
<description>the temporary path to store entities before dispatching</description>
</property>
<property>
<name>migrationPathStep3</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetGraph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetGraph">
<fs>
<delete path='${migrationPathStep3}'/>
<mkdir path='${migrationPathStep3}'/>
</fs>
<ok to="GenerateGraph"/>
<error to="Kill"/>
</action>
<action name="GenerateGraph">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateGraph</name>
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
<arg>-g</arg><arg>${migrationPathStep3}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,9 @@
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

@ -1,79 +1,89 @@
package eu.dnetlib.dhp.collection;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class CollectionJobTest {
private Path testDir;
@Before
public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection");
}
@After
public void teadDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile());
}
@Test
public void tesCollection() throws Exception {
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
GenerateNativeStoreSparkJob.main(new String[] {
"-mt", "local",
"-w", "wid",
"-e", "XML",
"-d", ""+System.currentTimeMillis(),
"-p", new ObjectMapper().writeValueAsString(provenance),
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
"-o", testDir.toString()+"/store",
"-t", "true",
"-ru", "",
"-rp", "",
"-rh", "",
"-ro", "",
"-rr", ""});
System.out.println(new ObjectMapper().writeValueAsString(provenance));
}
@Test
public void testGenerationMetadataRecord() throws Exception {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
assert record != null;
System.out.println(record.getId());
System.out.println(record.getOriginalId());
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
}
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance;
@Test
public void TestEquals () throws IOException {
import static org.junit.jupiter.api.Assertions.*;
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
assert record != null;
record.setBody("ciao");
assert record1 != null;
record1.setBody("mondo");
Assert.assertEquals(record, record1);
public class CollectionJobTest {
}
private Path testDir;
@BeforeEach
public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection");
}
@AfterEach
public void teadDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile());
}
@Test
public void tesCollection() throws Exception {
final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
GenerateNativeStoreSparkJob.main(new String[] {
"-mt", "local",
"-w", "wid",
"-e", "XML",
"-d", "" + System.currentTimeMillis(),
"-p", new ObjectMapper().writeValueAsString(provenance),
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
"-o", testDir.toString() + "/store",
"-t", "true",
"-ru", "",
"-rp", "",
"-rh", "",
"-ro", "",
"-rr", "" });
System.out.println(new ObjectMapper().writeValueAsString(provenance));
}
@Test
public void testGenerationMetadataRecord() throws Exception {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
final MetadataRecord record = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
assert record != null;
System.out.println(record.getId());
System.out.println(record.getOriginalId());
}
@Test
public void TestEquals() throws IOException {
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
final MetadataRecord record = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
final MetadataRecord record1 = GenerateNativeStoreSparkJob
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
"ns_prefix"), System.currentTimeMillis(), null, null);
assert record != null;
record.setBody("ciao");
assert record1 != null;
record1.setBody("mondo");
assertEquals(record, record1);
}
}

@ -7,13 +7,13 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.File;
import static org.junit.Assert.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.*;
@ -24,7 +24,7 @@ public class DnetCollectorWorkerApplicationTests {
private MessageManager messageManager = mock(MessageManager.class);
private DnetCollectorWorker worker;
@Before
@BeforeEach
public void setup() throws Exception {
ObjectMapper mapper = new ObjectMapper();
final String apiJson = mapper.writeValueAsString(getApi());
@ -47,7 +47,7 @@ public class DnetCollectorWorkerApplicationTests {
}
@After
@AfterEach
public void dropDown(){
File f = new File("/tmp/file.seq");
f.delete();

@ -0,0 +1,293 @@
package eu.dnetlib.dhp.migration.step1;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.IOException;
import java.sql.Array;
import java.sql.Date;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.Objects;
import static org.junit.jupiter.api.Assertions.assertEquals;
@ExtendWith(MockitoExtension.class)
public class MigrateDbEntitiesApplicationTest {
private MigrateDbEntitiesApplication app;
@Mock
private ResultSet rs;
@BeforeEach
public void setUp() {
this.app = new MigrateDbEntitiesApplication();
}
@Test
public void testProcessDatasource() throws Exception {
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
final List<Oaf> list = app.processDatasource(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Datasource ds = (Datasource) list.get(0);
assertValidId(ds.getId());
assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields));
assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields));
assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessProject() throws Exception {
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
final List<Oaf> list = app.processProject(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Project p = (Project) list.get(0);
assertValidId(p.getId());
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
final List<Oaf> list = app.processOrganization(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Organization o = (Organization) list.get(0);
assertValidId(o.getId());
assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields));
assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]);
assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]);
assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]);
assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]);
assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processDatasourceOrganization(rs);
assertEquals(2, list.size());
verifyMocks(fields);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
}
@Test
public void testProcessProjectOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
final List<Oaf> list = app.processProjectOrganization(rs);
assertEquals(2, list.size());
verifyMocks(fields);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
}
@Test
public void testProcessClaims_context() throws Exception {
final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
assertEquals(1, list.size());
verifyMocks(fields);
}
@Test
public void testProcessClaims_rels() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
assertEquals(2, list.size());
verifyMocks(fields);
}
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
final ObjectMapper mapper = new ObjectMapper();
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
for (final TypedField tf : list) {
if (tf.getValue() == null) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
break;
case "array":
Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(null);
break;
}
} else {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString()));
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString()));
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString()));
break;
case "array":
final Array arr = Mockito.mock(Array.class);
final String[] values = ((List<?>) tf.getValue()).stream()
.filter(Objects::nonNull)
.map(o -> o.toString())
.toArray(String[]::new);
Mockito.when(arr.getArray()).thenReturn(values);
Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
break;
}
}
}
return list;
}
private void verifyMocks(final List<TypedField> list) throws SQLException {
for (final TypedField tf : list) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
break;
case "date":
Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
break;
case "int":
Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
break;
case "double":
Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
break;
case "array":
Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
break;
case "string":
default:
Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
break;
}
}
}
private void assertValidId(final String id) {
assertEquals(49, id.length());
assertEquals('|', id.charAt(2));
assertEquals(':', id.charAt(15));
assertEquals(':', id.charAt(16));
}
private String getValueAsString(final String name, final List<TypedField> fields) {
return fields.stream()
.filter(f -> f.getField().equals(name))
.map(TypedField::getValue)
.filter(Objects::nonNull)
.map(o -> o.toString())
.findFirst()
.get();
}
}
class TypedField {
private String field;
private String type;
private Object value;
public String getField() {
return field;
}
public void setField(final String field) {
this.field = field;
}
public String getType() {
return type;
}
public void setType(final String type) {
this.type = type;
}
public Object getValue() {
return value;
}
public void setValue(final Object value) {
this.value = value;
}
}

@ -6,47 +6,32 @@ import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
import eu.dnetlib.dhp.utils.DHPUtils;
import net.sf.saxon.s9api.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.junit.*;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;
import org.mockito.junit.jupiter.MockitoExtension;
import javax.xml.transform.stream.StreamSource;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@ExtendWith(MockitoExtension.class)
public class TransformationJobTest {
@Mock
LongAccumulator accumulator;
@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule();
private Path testDir;
@Before
public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection");
}
@After
public void tearDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile());
}
private LongAccumulator accumulator;
@Test
public void testTransformSaxonHE() throws Exception {
@ -70,9 +55,9 @@ public class TransformationJobTest {
System.out.println(output.toString());
}
@DisplayName("Test TransformSparkJobNode.main")
@Test
public void transformTest() throws Exception {
public void transformTest(@TempDir Path testDir) throws Exception {
final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
final String mdstore_output = testDir.toString()+"/version";
final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")));
@ -89,8 +74,6 @@ public class TransformationJobTest {
"-rh", "",
"-ro", "",
"-rr", ""});
}
@Test
@ -121,7 +104,7 @@ public class TransformationJobTest {
record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
final MetadataRecord result = tf.call(record);
Assert.assertNotNull(result.getBody());
assertNotNull(result.getBody());
System.out.println(result.getBody());
}

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.transformation.vocabulary;
import org.junit.Test;
import static org.junit.Assert.*;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
public class VocabularyTest {

@ -0,0 +1,27 @@
[
{
"field": "source_type",
"type": "string",
"value": "context"
},
{
"field": "source_id",
"type": "string",
"value": "oa-pg"
},
{
"field": "target_type",
"type": "string",
"value": "publication"
},
{
"field": "target_id",
"type": "string",
"value": "userclaim___::d99de49026e79d271f3e7451d8de18b6"
},
{
"field": "semantics",
"type": "not_used",
"value": "isRelevantTo"
}
]

@ -0,0 +1,27 @@
[
{
"field": "source_type",
"type": "string",
"value": "project"
},
{
"field": "source_id",
"type": "string",
"value": "corda__h2020::b38a638a93b505d670fcacc47a0283d6"
},
{
"field": "target_type",
"type": "string",
"value": "publication"
},
{
"field": "target_id",
"type": "string",
"value": "userclaim___::5b5117253d3c64c79809d0b92fa287b4"
},
{
"field": "semantics",
"type": "not_used",
"value": "resultProject_outcome_produces"
}
]

@ -0,0 +1,62 @@
[
{
"field": "datasource",
"type": "string",
"value": "openaire____::revistasunicauca"
},
{
"field": "organization",
"type": "string",
"value": "openaire____::openaire____::revistasunicauca"
},
{
"field": "startdate",
"type": "not_used",
"value": null
},
{
"field": "enddate",
"type": "not_used",
"value": null
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": null
},
{
"field": "collectedfromname",
"type": "string",
"value": null
},
{
"field": "semantics",
"type": "not_used",
"value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": null
}
]

@ -0,0 +1,234 @@
[
{
"field": "datasourceid",
"type": "string",
"value": "274269ac6f3b::2579-5449"
},
{
"field": "identities",
"type": "not_used",
"value": [
"274269ac6f3b::2579-5449",
null
]
},
{
"field": "officialname",
"type": "string",
"value": "Jurnal Ilmiah Pendidikan Scholastic"
},
{
"field": "englishname",
"type": "string",
"value": "Jurnal Ilmiah Pendidikan Scholastic"
},
{
"field": "contactemail",
"type": "string",
"value": "test@test.it"
},
{
"field": "openairecompatibility",
"type": "string",
"value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel"
},
{
"field": "websiteurl",
"type": "string",
"value": "http://e-journal.sastra-unes.com/index.php/JIPS/index"
},
{
"field": "logourl",
"type": "string",
"value": null
},
{
"field": "accessinfopackage",
"type": "array",
"value": [
null
]
},
{
"field": "latitude",
"type": "double",
"value": 0
},
{
"field": "longitude",
"type": "double",
"value": 0
},
{
"field": "namespaceprefix",
"type": "string",
"value": "ojs_25795449"
},
{
"field": "odnumberofitems",
"type": "int",
"value": null
},
{
"field": "odnumberofitemsdate",
"type": "date",
"value": null
},
{
"field": "subjects",
"type": "array",
"value": null
},
{
"field": "description",
"type": "string",
"value": null
},
{
"field": "odpolicies",
"type": "string",
"value": null
},
{
"field": "odlanguages",
"type": "array",
"value": []
},
{
"field": "odcontenttypes",
"type": "array",
"value": [
"Journal articles"
]
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "dateofcollection",
"type": "date",
"value": "2020-01-21"
},
{
"field": "dateofvalidation",
"type": "date",
"value": null
},
{
"field": "releasestartdate",
"type": "date",
"value": null
},
{
"field": "releaseenddate",
"type": "date",
"value": null
},
{
"field": "missionstatementurl",
"type": "string",
"value": null
},
{
"field": "dataprovider",
"type": "boolean",
"value": null
},
{
"field": "serviceprovider",
"type": "boolean",
"value": null
},
{
"field": "databaseaccesstype",
"type": "string",
"value": null
},
{
"field": "datauploadtype",
"type": "string",
"value": null
},
{
"field": "databaseaccessrestriction",
"type": "string",
"value": null
},
{
"field": "datauploadrestriction",
"type": "string",
"value": null
},
{
"field": "versioning",
"type": "boolean",
"value": null
},
{
"field": "citationguidelineurl",
"type": "string",
"value": null
},
{
"field": "qualitymanagementkind",
"type": "string",
"value": null
},
{
"field": "pidsystems",
"type": "string",
"value": null
},
{
"field": "certificates",
"type": "string",
"value": null
},
{
"field": "policies",
"type": "not_used",
"value": []
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
},
{
"field": "collectedfromname",
"type": "string",
"value": "Jurnal Fakultas Sastra Universitas Ekasakti"
},
{
"field": "datasourcetype",
"type": "string",
"value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
},
{
"field": "journal",
"type": "string",
"value": "2579-5449@@@2597-6540@@@"
}
]

@ -0,0 +1,127 @@
[
{
"field": "organizationid",
"type": "string",
"value": "openaire____::openaire____::microsoft"
},
{
"field": "legalshortname",
"type": "string",
"value": "MSFTResearch"
},
{
"field": "legalname",
"type": "string",
"value": "Microsoft Research"
},
{
"field": "websiteurl",
"type": "string",
"value": "https://www.microsoft.com/en-us/research/"
},
{
"field": "logourl",
"type": "string",
"value": null
},
{
"field": "eclegalbody",
"type": "boolean",
"value": false
},
{
"field": "eclegalperson",
"type": "boolean",
"value": false
},
{
"field": "ecnonprofit",
"type": "boolean",
"value": false
},
{
"field": "ecresearchorganization",
"type": "boolean",
"value": false
},
{
"field": "echighereducation",
"type": "boolean",
"value": false
},
{
"field": "ecinternationalorganizationeurinterests",
"type": "boolean",
"value": false
},
{
"field": "ecinternationalorganization",
"type": "boolean",
"value": false
},
{
"field": "ecenterprise",
"type": "boolean",
"value": false
},
{
"field": "ecsmevalidated",
"type": "boolean",
"value": false
},
{
"field": "ecnutscode",
"type": "boolean",
"value": false
},
{
"field": "dateofcollection",
"type": "date",
"value": "2018-10-19"
},
{
"field": "dateoftransformation",
"type": "date",
"value": "2018-10-19"
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": ""
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::TEST"
},
{
"field": "collectedfromname",
"type": "string",
"value": "TEST"
},
{
"field": "country",
"type": "string",
"value": "US@@@US@@@dnet:countries@@@dnet:countries"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
}
]

@ -0,0 +1,72 @@
[
{
"field": "project",
"type": "string",
"value": "nsf_________::1700003"
},
{
"field": "resporganization",
"type": "string",
"value": "nsf_________::University_of_Notre_Dame"
},
{
"field": "participantnumber",
"type": "not_used",
"value": 1
},
{
"field": "contribution",
"type": "not_used",
"value": null
},
{
"field": "startdate",
"type": "not_used",
"value": null
},
{
"field": "enddate",
"type": "not_used",
"value": null
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::nsf"
},
{
"field": "collectedfromname",
"type": "string",
"value": "NSF - National Science Foundation"
},
{
"field": "semantics",
"type": "not_used",
"value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
}
]

@ -0,0 +1,193 @@
[
{
"field": "projectid",
"type": "string",
"value": "aka_________::100469"
},
{
"field": "code",
"type": "string",
"value": "100469"
},
{
"field": "websiteurl",
"type": "string",
"value": "http://test"
},
{
"field": "acronym",
"type": "string",
"value": "RMCAG"
},
{
"field": "title",
"type": "string",
"value": "Regulation of melanoma cell autonomous growth"
},
{
"field": "startdate",
"type": "date",
"value": null
},
{
"field": "enddate",
"type": "date",
"value": null
},
{
"field": "callidentifier",
"type": "string",
"value": "Tutkijankoulutus ja työskentely ulkomailla/kevät TT"
},
{
"field": "keywords",
"type": "string",
"value": null
},
{
"field": "duration",
"type": "int",
"value": null
},
{
"field": "ecsc39",
"type": "boolean",
"value": null
},
{
"field": "oamandatepublications",
"type": "boolean",
"value": false
},
{
"field": "ecarticle29_3",
"type": "boolean",
"value": null
},
{
"field": "dateofcollection",
"type": "date",
"value": "2019-01-25"
},
{
"field": "dateoftransformation",
"type": "date",
"value": "2019-04-16"
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "optional1",
"type": "string",
"value": "9,284 €"
},
{
"field": "optional2",
"type": "string",
"value": null
},
{
"field": "jsonextrainfo",
"type": "string",
"value": "{}"
},
{
"field": "contactfullname",
"type": "string",
"value": null
},
{
"field": "contactfax",
"type": "string",
"value": null
},
{
"field": "contactphone",
"type": "string",
"value": null
},
{
"field": "contactemail",
"type": "string",
"value": null
},
{
"field": "summary",
"type": "string",
"value": null
},
{
"field": "currency",
"type": "string",
"value": null
},
{
"field": "totalcost",
"type": "double",
"value": null
},
{
"field": "fundedamount",
"type": "double",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::aka"
},
{
"field": "collectedfromname",
"type": "string",
"value": "Academy of Finland"
},
{
"field": "contracttype",
"type": "string",
"value": null
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions"
},
{
"field": "pid",
"type": "not_used",
"value": [
null
]
},
{
"field": "subjects",
"type": "array",
"value": [
null
]
},
{
"field": "fundingtree",
"type": "array",
"value": [
"<fundingtree><funder>\n <id>aka_________::AKA</id>\n <shortname>AKA</shortname>\n <name>Academy of Finland</name>\n <originalname>Academy of Finland</originalname>\n <jurisdiction>FI</jurisdiction>\n </funder></fundingtree>"
]
}
]

@ -0,0 +1,97 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
</dependencies>
</project>

@ -1,4 +1,4 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils;

@ -1,6 +1,5 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
@ -11,7 +10,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2;
import java.util.Collection;
@ -69,8 +68,6 @@ public class DedupRecordFactory {
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
@ -103,7 +100,6 @@ public class DedupRecordFactory {
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
@ -136,7 +132,6 @@ public class DedupRecordFactory {
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e._2().forEach(proj -> {
try {
@ -160,7 +155,6 @@ public class DedupRecordFactory {
s.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(soft -> {
@ -188,7 +182,6 @@ public class DedupRecordFactory {
Datasource d = new Datasource(); //the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e._2().forEach(dat -> {
try {
@ -213,7 +206,6 @@ public class DedupRecordFactory {
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
StringBuilder trust = new StringBuilder("0.0");
@ -254,7 +246,6 @@ public class DedupRecordFactory {
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();

@ -1,35 +1,32 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import scala.Tuple2;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.io.StringReader;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupUtility {
private static final Double THRESHOLD = 0.95;
@ -54,38 +51,6 @@ public class DedupUtility {
return accumulators;
}
public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
return context.textFile(path);
}
public static void deleteIfExists(String path) throws IOException {
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(new Path(path))) {
fileSystem.delete(new Path(path), true);
}
}
public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
Configuration conf = new Configuration();
FileSystem fileSystem = FileSystem.get(conf);
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
}
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
}
}
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
}
@ -146,16 +111,20 @@ public class DedupUtility {
});
}
public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
}
public static String createEntityPath(final String basePath, final String entityType) {
return String.format("%s/%s", basePath, entityType);
}
public static String createSimRelPath(final String basePath, final String entityType) {
return String.format("%s/%s/simRel", basePath, entityType);
public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) {
return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType);
}
public static String createMergeRelPath(final String basePath, final String entityType) {
return String.format("%s/%s/mergeRel", basePath, entityType);
public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
}
private static Double sim(Author a, Author b) {
@ -216,4 +185,37 @@ public class DedupUtility {
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator);
String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery);
final Document doc = new SAXReader().read(new StringReader(orchestratorProfile));
final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id");
final List<DedupConfig> configurations = new ArrayList<>();
for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) {
configurations.add(loadConfig(isLookUpService, actionSetId, o));
}
return configurations;
}
private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o)
throws ISLookUpException {
final Element s = (Element) o;
final String configProfileId = s.attributeValue("id");
final String conf =
isLookUpService.getResourceProfileByQuery(String.format(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
configProfileId));
final DedupConfig dedupConfig = DedupConfig.load(conf);
dedupConfig.getWf().setConfigurationId(actionSetId);
return dedupConfig;
}
}

@ -1,7 +1,6 @@
package eu.dnetlib.dedup;
package eu.dnetlib.dhp.dedup;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.dedup;
import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
public class SparkCreateConnectedComponent {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json")));
parser.parseArgument(args);
new SparkCreateConnectedComponent().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
final String graphBasePath = parser.get("graphBasePath");
final String workingPath = parser.get("workingPath");
final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId");
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
final String entity = dedupConf.getWf().getEntityType();
final String subEntity = dedupConf.getWf().getSubEntityValue();
final JavaPairRDD<Object, String> vertexes = sc.textFile(graphBasePath + "/" + subEntity)
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>)
s -> new Tuple2<Object, String>(getHashcode(s), s)
);
final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class));
final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
c.getDocIds()
.stream()
.flatMap(id -> {
List<Relation> tmp = new ArrayList<>();
Relation r = new Relation();
r.setSource(c.getCcId());
r.setTarget(id);
r.setRelClass("merges");
tmp.add(r);
r = new Relation();
r.setTarget(c.getCcId());
r.setSource(id);
r.setRelClass("isMergedIn");
tmp.add(r);
return tmp.stream();
}).iterator()).rdd(), Encoders.bean(Relation.class));
mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity));
}
}
}
public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashString(id).asLong();
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save