forked from D-Net/dnet-hadoop
merge with master
This commit is contained in:
commit
0cd022ad6a
|
@ -3,6 +3,8 @@
|
|||
*.iws
|
||||
*.ipr
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
*~
|
||||
.vscode
|
||||
.classpath
|
||||
|
@ -21,5 +23,5 @@
|
|||
/*/build
|
||||
/build
|
||||
spark-warehouse
|
||||
/*/*/job-override.properties
|
||||
/**/job-override.properties
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
@ -76,6 +76,41 @@
|
|||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
|
||||
<plugin>
|
||||
<groupId>org.eclipse.m2e</groupId>
|
||||
<artifactId>lifecycle-mapping</artifactId>
|
||||
<version>1.0.0</version>
|
||||
<configuration>
|
||||
<lifecycleMappingMetadata>
|
||||
<pluginExecutions>
|
||||
<pluginExecution>
|
||||
<pluginExecutionFilter>
|
||||
<groupId>
|
||||
org.apache.maven.plugins
|
||||
</groupId>
|
||||
<artifactId>
|
||||
maven-plugin-plugin
|
||||
</artifactId>
|
||||
<versionRange>
|
||||
[3.2,)
|
||||
</versionRange>
|
||||
<goals>
|
||||
<goal>descriptor</goal>
|
||||
</goals>
|
||||
</pluginExecutionFilter>
|
||||
<action>
|
||||
<ignore></ignore>
|
||||
</action>
|
||||
</pluginExecution>
|
||||
</pluginExecutions>
|
||||
</lifecycleMappingMetadata>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,22 +1,21 @@
|
|||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME;
|
||||
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNull;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
/**
|
||||
* @author mhorst
|
||||
* @author mhorst, claudio.atzori
|
||||
*
|
||||
*/
|
||||
public class GenerateOoziePropertiesMojoTest {
|
||||
|
||||
private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
|
||||
|
||||
@Before
|
||||
@BeforeEach
|
||||
public void clearSystemProperties() {
|
||||
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
|
||||
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
|
||||
|
@ -28,7 +27,7 @@ public class GenerateOoziePropertiesMojoTest {
|
|||
mojo.execute();
|
||||
|
||||
// assert
|
||||
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -1,51 +1,41 @@
|
|||
package eu.dnetlib.maven.plugin.properties;
|
||||
|
||||
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.doReturn;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.maven.plugin.MojoExecutionException;
|
||||
import org.apache.maven.project.MavenProject;
|
||||
import org.junit.Before;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.runners.MockitoJUnitRunner;
|
||||
import org.mockito.MockitoAnnotations;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Properties;
|
||||
|
||||
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.doReturn;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
/**
|
||||
* @author mhorst
|
||||
* @author mhorst, claudio.atzori
|
||||
*
|
||||
*/
|
||||
@RunWith(MockitoJUnitRunner.class)
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class WritePredefinedProjectPropertiesTest {
|
||||
|
||||
@Rule
|
||||
public TemporaryFolder testFolder = new TemporaryFolder();
|
||||
|
||||
@Mock
|
||||
private MavenProject mavenProject;
|
||||
|
||||
private WritePredefinedProjectProperties mojo;
|
||||
|
||||
@Before
|
||||
public void init() {
|
||||
@BeforeEach
|
||||
public void init(@TempDir File testFolder) {
|
||||
MockitoAnnotations.initMocks(this);
|
||||
mojo = new WritePredefinedProjectProperties();
|
||||
mojo.outputFile = getPropertiesFileLocation();
|
||||
mojo.outputFile = getPropertiesFileLocation(testFolder);
|
||||
mojo.project = mavenProject;
|
||||
doReturn(new Properties()).when(mavenProject).getProperties();
|
||||
lenient().doReturn(new Properties()).when(mavenProject).getProperties();
|
||||
}
|
||||
|
||||
// ----------------------------------- TESTS ---------------------------------------------
|
||||
|
@ -57,7 +47,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
|
||||
assertEquals(0, storedProperties.size());
|
||||
}
|
||||
|
||||
|
@ -75,28 +65,28 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception {
|
||||
@Test()
|
||||
public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
Properties projectProperties = new Properties();
|
||||
projectProperties.setProperty(key, value);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
mojo.outputFile = testFolder.getRoot();
|
||||
mojo.outputFile = testFolder;
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithProjectPropertiesExclusion() throws Exception {
|
||||
public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -113,14 +103,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithProjectPropertiesInclusion() throws Exception {
|
||||
public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -137,14 +127,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromFile() throws Exception {
|
||||
public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -155,7 +145,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties");
|
||||
File includedPropertiesFile = new File(testFolder, "included.properties");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.store(new FileWriter(includedPropertiesFile), null);
|
||||
|
@ -167,14 +157,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception {
|
||||
public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -192,14 +182,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception {
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromBlankLocation() {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -213,11 +203,11 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
mojo.setIncludePropertyKeysFromFiles(new String[] {""});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception {
|
||||
public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -228,7 +218,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
|
||||
File includedPropertiesFile = new File(testFolder, "included.xml");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null);
|
||||
|
@ -240,14 +230,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(1, storedProperties.size());
|
||||
assertTrue(storedProperties.containsKey(includedKey));
|
||||
assertEquals(includedValue, storedProperties.getProperty(includedKey));
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception {
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "projectPropertyKey";
|
||||
String value = "projectPropertyValue";
|
||||
|
@ -258,7 +248,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
projectProperties.setProperty(includedKey, includedValue);
|
||||
doReturn(projectProperties).when(mavenProject).getProperties();
|
||||
|
||||
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml");
|
||||
File includedPropertiesFile = new File(testFolder, "included.xml");
|
||||
Properties includedProperties = new Properties();
|
||||
includedProperties.setProperty(includedKey, "irrelevantValue");
|
||||
includedProperties.store(new FileOutputStream(includedPropertiesFile), null);
|
||||
|
@ -266,11 +256,11 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithQuietModeOn() throws Exception {
|
||||
public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
mojo.setQuiet(true);
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
|
||||
|
@ -280,21 +270,21 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertEquals(0, storedProperties.size());
|
||||
}
|
||||
|
||||
@Test(expected=MojoExecutionException.class)
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception {
|
||||
@Test
|
||||
public void testExecuteIncludingPropertyKeysFromInvalidFile() {
|
||||
// given
|
||||
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
|
||||
|
||||
// execute
|
||||
mojo.execute();
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithEnvironmentProperties() throws Exception {
|
||||
public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
mojo.setIncludeEnvironmentVariables(true);
|
||||
|
||||
|
@ -303,7 +293,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
for (Object currentKey : storedProperties.keySet()) {
|
||||
assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV));
|
||||
|
@ -311,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithSystemProperties() throws Exception {
|
||||
public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "systemPropertyKey";
|
||||
String value = "systemPropertyValue";
|
||||
|
@ -323,14 +313,14 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
assertTrue(storedProperties.containsKey(key));
|
||||
assertEquals(value, storedProperties.getProperty(key));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception {
|
||||
public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception {
|
||||
// given
|
||||
String key = "systemPropertyKey ";
|
||||
String value = "systemPropertyValue";
|
||||
|
@ -344,7 +334,7 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// assert
|
||||
assertTrue(mojo.outputFile.exists());
|
||||
Properties storedProperties = getStoredProperties();
|
||||
Properties storedProperties = getStoredProperties(testFolder);
|
||||
assertTrue(storedProperties.size() > 0);
|
||||
assertFalse(storedProperties.containsKey(key));
|
||||
assertTrue(storedProperties.containsKey(key.trim()));
|
||||
|
@ -353,13 +343,13 @@ public class WritePredefinedProjectPropertiesTest {
|
|||
|
||||
// ----------------------------------- PRIVATE -------------------------------------------
|
||||
|
||||
private File getPropertiesFileLocation() {
|
||||
return new File(testFolder.getRoot(), "test.properties");
|
||||
private File getPropertiesFileLocation(File testFolder) {
|
||||
return new File(testFolder, "test.properties");
|
||||
}
|
||||
|
||||
private Properties getStoredProperties() throws FileNotFoundException, IOException {
|
||||
private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException {
|
||||
Properties properties = new Properties();
|
||||
properties.load(new FileInputStream(getPropertiesFileLocation()));
|
||||
properties.load(new FileInputStream(getPropertiesFileLocation(testFolder)));
|
||||
return properties;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
@ -42,6 +42,23 @@
|
|||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>jcl-over-slf4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.ximpleware</groupId>
|
||||
<artifactId>vtd-xml</artifactId>
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.dhp.utils;
|
||||
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
|
||||
|
||||
public class ISLookupClientFactory {
|
||||
|
||||
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
|
||||
|
||||
public static ISLookUpService getLookUpService(final String isLookupUrl) {
|
||||
return getServiceStub(ISLookUpService.class, isLookupUrl);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
|
||||
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
|
||||
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
|
||||
jaxWsProxyFactory.setServiceClass(clazz);
|
||||
jaxWsProxyFactory.setAddress(endpoint);
|
||||
return (T) jaxWsProxyFactory.create();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.lib.ExtensionFunctionCall;
|
||||
import net.sf.saxon.lib.ExtensionFunctionDefinition;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.om.StructuredQName;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
||||
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
|
||||
|
||||
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
|
||||
|
||||
public abstract String getName();
|
||||
public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
|
||||
|
||||
@Override
|
||||
public StructuredQName getFunctionQName() {
|
||||
return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtensionFunctionCall makeCallExpression() {
|
||||
return new ExtensionFunctionCall() {
|
||||
@Override
|
||||
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
return doCall(context, arguments);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.GregorianCalendar;
|
||||
|
||||
public class ExtractYear extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "extractYear";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
final Item item = arguments[0].head();
|
||||
if (item == null) {
|
||||
return new StringValue("");
|
||||
}
|
||||
return new StringValue(_year(item.getStringValue()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
Calendar c = new GregorianCalendar();
|
||||
for (String format : dateFormats) {
|
||||
try {
|
||||
c.setTime(new SimpleDateFormat(format).parse(s));
|
||||
String year = String.valueOf(c.get(Calendar.YEAR));
|
||||
return year;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
public class NormalizeDate extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
|
||||
|
||||
private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_year(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
final String date = s != null ? s.trim() : "";
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class PickFirst extends AbstractExtensionFunction {
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "pickFirst";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
|
||||
final String s1 = getValue(arguments[0]);
|
||||
final String s2 = getValue(arguments[1]);
|
||||
|
||||
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
|
||||
}
|
||||
|
||||
private String getValue(final Sequence arg) throws XPathException {
|
||||
if (arg != null) {
|
||||
final Item item = arg.head();
|
||||
if (item != null) {
|
||||
return item.getStringValue();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.Configuration;
|
||||
import net.sf.saxon.TransformerFactoryImpl;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class SaxonTransformerFactory {
|
||||
|
||||
/**
|
||||
* Creates the index record transformer from the given XSLT
|
||||
* @param xslt
|
||||
* @return
|
||||
* @throws TransformerException
|
||||
*/
|
||||
public static Transformer newInstance(final String xslt) throws TransformerException {
|
||||
|
||||
final TransformerFactoryImpl factory = new TransformerFactoryImpl();
|
||||
final Configuration conf = factory.getConfiguration();
|
||||
conf.registerExtensionFunction(new ExtractYear());
|
||||
conf.registerExtensionFunction(new NormalizeDate());
|
||||
conf.registerExtensionFunction(new PickFirst());
|
||||
|
||||
return factory.newTransformer(new StreamSource(new StringReader(xslt)));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,18 +1,13 @@
|
|||
package eu.dnetlib.dhp.application;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Base64;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
public class ArgumentApplicationParserTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testParseParameter() throws Exception {
|
||||
final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
package eu.dnetlib.dhp.model.mdstore;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class MetadataRecordTest {
|
||||
|
||||
|
@ -10,6 +10,6 @@ public class MetadataRecordTest {
|
|||
public void getTimestamp() {
|
||||
|
||||
MetadataRecord r = new MetadataRecord();
|
||||
assertTrue(r.getDateOfCollection() >0);
|
||||
assertTrue(r.getDateOfCollection() > 0);
|
||||
}
|
||||
}
|
|
@ -1,12 +1,12 @@
|
|||
package eu.dnetlib.message;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class MessageTest {
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package eu.dnetlib.scholexplorer.relation;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
|
||||
public class RelationMapperTest {
|
||||
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 689 KiB |
|
@ -1,3 +1,11 @@
|
|||
Description of the project
|
||||
--------------------------
|
||||
This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
|
||||
This project defines **object schemas** of the OpenAIRE main entities and the relationships that intercur among them.
|
||||
Namely it defines the model for
|
||||
|
||||
- **research product (result)** which subclasses in publication, dataset, other research product, software
|
||||
- **data source** object describing the data provider (institutional repository, aggregators, cris systems)
|
||||
- **organization** research bodies managing a data source or participating to a research project
|
||||
- **project** research project
|
||||
|
||||
Te serialization of such objects (data store files) are used to pass data between workflow nodes in the processing pipeline.
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
@ -27,18 +27,15 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<scope>test</scope>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.dhp.schema.action;
|
||||
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
@JsonDeserialize(using = AtomicActionDeserializer.class)
|
||||
public class AtomicAction<T extends Oaf> implements Serializable {
|
||||
|
||||
private Class<T> clazz;
|
||||
|
||||
private T payload;
|
||||
|
||||
public AtomicAction() {
|
||||
}
|
||||
|
||||
public AtomicAction(Class<T> clazz, T payload) {
|
||||
this.clazz = clazz;
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
public Class<T> getClazz() {
|
||||
return clazz;
|
||||
}
|
||||
|
||||
public void setClazz(Class<T> clazz) {
|
||||
this.clazz = clazz;
|
||||
}
|
||||
|
||||
public T getPayload() {
|
||||
return payload;
|
||||
}
|
||||
|
||||
public void setPayload(T payload) {
|
||||
this.payload = payload;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package eu.dnetlib.dhp.schema.action;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonParser;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.DeserializationContext;
|
||||
import com.fasterxml.jackson.databind.JsonDeserializer;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class AtomicActionDeserializer extends JsonDeserializer {
|
||||
|
||||
@Override
|
||||
public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException {
|
||||
JsonNode node = jp.getCodec().readTree(jp);
|
||||
String classTag = node.get("clazz").asText();
|
||||
JsonNode payload = node.get("payload");
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
try {
|
||||
final Class<?> clazz = Class.forName(classTag);
|
||||
return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz));
|
||||
} catch (ClassNotFoundException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
public class Country extends Qualifier {
|
||||
|
||||
private DataInfo dataInfo;
|
||||
|
||||
public DataInfo getDataInfo() {
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
}
|
|
@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> odlanguages;
|
||||
|
||||
private List< Field<String>> odcontenttypes;
|
||||
private List<Field<String>> odcontenttypes;
|
||||
|
||||
private List< Field<String>> accessinfopackage;
|
||||
private List<Field<String>> accessinfopackage;
|
||||
|
||||
// re3data fields
|
||||
private Field<String> releasestartdate;
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
|
|||
this.place = place;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(point) &&
|
||||
StringUtils.isBlank(box) &&
|
||||
|
|
|
@ -22,6 +22,14 @@ public class Instance implements Serializable {
|
|||
|
||||
private Field<String> dateofacceptance;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
@ -86,7 +94,29 @@ public class Instance implements Serializable {
|
|||
this.dateofacceptance = dateofacceptance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public void setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public void setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
public String toComparableString(){
|
||||
return String.format("%s::%s::%s::%s",
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JacksonInject;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
@JsonIgnoreProperties({"blank"})
|
||||
|
||||
public class KeyValue implements Serializable {
|
||||
|
||||
private String key;
|
||||
|
@ -39,7 +37,6 @@ public class KeyValue implements Serializable {
|
|||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public String toComparableString() {
|
||||
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
|
|||
schemeid != null ? schemeid : "",
|
||||
schemename != null ? schemename : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(classid) &&
|
||||
StringUtils.isBlank(classname) &&
|
||||
|
|
|
@ -1,85 +1,104 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
public class Relation extends Oaf {
|
||||
|
||||
private String relType;
|
||||
private String relType;
|
||||
|
||||
private String subRelType;
|
||||
private String subRelType;
|
||||
|
||||
private String relClass;
|
||||
private String relClass;
|
||||
|
||||
private String source;
|
||||
private String source;
|
||||
|
||||
private String target;
|
||||
private String target;
|
||||
|
||||
private List<KeyValue> collectedFrom;
|
||||
private List<KeyValue> collectedFrom = new ArrayList<>();
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public void setRelType(String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
public String getSubRelType() {
|
||||
return subRelType;
|
||||
}
|
||||
|
||||
public void setSubRelType(String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
}
|
||||
public void setSubRelType(final String subRelType) {
|
||||
this.subRelType = subRelType;
|
||||
}
|
||||
|
||||
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
public String getRelClass() {
|
||||
return relClass;
|
||||
}
|
||||
|
||||
public void setRelClass(String relClass) {
|
||||
this.relClass = relClass;
|
||||
}
|
||||
public void setRelClass(final String relClass) {
|
||||
this.relClass = relClass;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(String source) {
|
||||
this.source = source;
|
||||
}
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(String target) {
|
||||
this.target = target;
|
||||
}
|
||||
public void setTarget(final String target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public List<KeyValue> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
public List<KeyValue> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
|
||||
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
public void setCollectedFrom(final List<KeyValue> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
|
||||
public void mergeFrom(Relation other) {
|
||||
this.mergeOAFDataInfo(other);
|
||||
if (other.getCollectedFrom() == null || other.getCollectedFrom().size() == 0)
|
||||
return;
|
||||
if (collectedFrom == null && other.getCollectedFrom() != null) {
|
||||
collectedFrom = other.getCollectedFrom();
|
||||
return;
|
||||
}
|
||||
if (other.getCollectedFrom() != null) {
|
||||
collectedFrom.addAll(other.getCollectedFrom());
|
||||
public void mergeFrom(final Relation r) {
|
||||
|
||||
checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal");
|
||||
checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal");
|
||||
checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal");
|
||||
checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be equal");
|
||||
checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal");
|
||||
|
||||
setCollectedFrom(Stream.concat(getCollectedFrom().stream(), r.getCollectedFrom().stream())
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
Relation relation = (Relation) o;
|
||||
return relType.equals(relation.relType) &&
|
||||
subRelType.equals(relation.subRelType) &&
|
||||
relClass.equals(relation.relClass) &&
|
||||
source.equals(relation.source) &&
|
||||
target.equals(relation.target) &&
|
||||
Objects.equals(collectedFrom, relation.collectedFrom);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom);
|
||||
}
|
||||
|
||||
collectedFrom = new ArrayList<>(collectedFrom
|
||||
.stream()
|
||||
.collect(Collectors.toMap(KeyValue::toComparableString, x -> x, (x1, x2) -> x1))
|
||||
.values());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class Result extends OafEntity implements Serializable {
|
||||
public class Result extends OafEntity implements Serializable {
|
||||
|
||||
private List<Author> author;
|
||||
|
||||
|
@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
// common fields
|
||||
private Qualifier language;
|
||||
|
||||
private List<Qualifier> country;
|
||||
private List<Country> country;
|
||||
|
||||
private List<StructuredProperty> subject;
|
||||
|
||||
|
@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> coverage;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
private Qualifier bestaccessright;
|
||||
|
||||
private List<Context> context;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private List<ExternalReference> externalReference;
|
||||
|
||||
private List<Instance> instance;
|
||||
|
@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.language = language;
|
||||
}
|
||||
|
||||
public List<Qualifier> getCountry() {
|
||||
public List<Country> getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(List<Qualifier> country) {
|
||||
public void setCountry(List<Country> country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
|
@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.coverage = coverage;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
public Qualifier getBestaccessright() {
|
||||
return bestaccessright;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
public void setBestaccessright(Qualifier bestaccessright) {
|
||||
this.bestaccessright = bestaccessright;
|
||||
}
|
||||
|
||||
public List<Context> getContext() {
|
||||
|
@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.instance = instance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public Result setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public Result setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
|
@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
coverage = mergeLists(coverage, r.getCoverage());
|
||||
|
||||
if (r.getRefereed() != null && compareTrust(this, r) < 0)
|
||||
refereed = r.getRefereed();
|
||||
|
||||
context = mergeLists(context, r.getContext());
|
||||
|
||||
if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
|
||||
processingchargeamount = r.getProcessingchargeamount();
|
||||
|
||||
if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
|
||||
processingchargecurrency = r.getProcessingchargecurrency();
|
||||
|
||||
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
return a.size() > b.size() ? a : b;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.dhp.schema.action;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
/**
|
||||
* @author claudio.atzori
|
||||
*/
|
||||
public class AtomicActionTest {
|
||||
|
||||
@Test
|
||||
public void serializationTest() throws IOException {
|
||||
|
||||
Relation rel = new Relation();
|
||||
rel.setSource("1");
|
||||
rel.setTarget("2");
|
||||
rel.setRelType("resultResult");
|
||||
rel.setSubRelType("dedup");
|
||||
rel.setRelClass("merges");
|
||||
|
||||
AtomicAction aa1 = new AtomicAction(Relation.class, rel);
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
String json = mapper.writeValueAsString(aa1);
|
||||
|
||||
assertTrue(StringUtils.isNotBlank(json));
|
||||
|
||||
AtomicAction aa2 = mapper.readValue(json, AtomicAction.class);
|
||||
|
||||
assertEquals(aa1.getClazz(), aa2.getClazz());
|
||||
assertEquals(aa1.getPayload(), aa2.getPayload());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,11 +1,9 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -13,7 +11,7 @@ public class MergeTest {
|
|||
|
||||
OafEntity oaf;
|
||||
|
||||
@Before
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
oaf = new Publication();
|
||||
}
|
||||
|
@ -44,8 +42,8 @@ public class MergeTest {
|
|||
|
||||
a.mergeFrom(b);
|
||||
|
||||
Assert.assertNotNull(a.getCollectedfrom());
|
||||
Assert.assertEquals(3, a.getCollectedfrom().size());
|
||||
assertNotNull(a.getCollectedfrom());
|
||||
assertEquals(3, a.getCollectedfrom().size());
|
||||
|
||||
}
|
||||
|
||||
|
@ -60,8 +58,8 @@ public class MergeTest {
|
|||
|
||||
a.mergeFrom(b);
|
||||
|
||||
Assert.assertNotNull(a.getSubject());
|
||||
Assert.assertEquals(3, a.getSubject().size());
|
||||
assertNotNull(a.getSubject());
|
||||
assertEquals(3, a.getSubject().size());
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
@ -24,6 +24,61 @@
|
|||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.sun.xml.bind</groupId>
|
||||
<artifactId>jaxb-core</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon-dom</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>jgrapht</groupId>
|
||||
<artifactId>jgrapht</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.sf.ehcache</groupId>
|
||||
<artifactId>ehcache</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-test</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>apache</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-data-protos</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -44,13 +99,22 @@
|
|||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-core</artifactId>
|
||||
<version>2.25.0</version>
|
||||
<scope>test</scope>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
<version>42.2.10</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class LicenseComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
if (left == null && right == null) return 0;
|
||||
if (left == null) return 1;
|
||||
if (right == null) return -1;
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass)) return 0;
|
||||
|
||||
if (lClass.equals("OPEN SOURCE")) return -1;
|
||||
if (rClass.equals("OPEN SOURCE")) return 1;
|
||||
|
||||
if (lClass.equals("OPEN")) return -1;
|
||||
if (rClass.equals("OPEN")) return 1;
|
||||
|
||||
if (lClass.equals("6MONTHS")) return -1;
|
||||
if (rClass.equals("6MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("12MONTHS")) return -1;
|
||||
if (rClass.equals("12MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("EMBARGO")) return -1;
|
||||
if (rClass.equals("EMBARGO")) return 1;
|
||||
|
||||
if (lClass.equals("RESTRICTED")) return -1;
|
||||
if (rClass.equals("RESTRICTED")) return 1;
|
||||
|
||||
if (lClass.equals("CLOSED")) return -1;
|
||||
if (rClass.equals("CLOSED")) return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN")) return -1;
|
||||
if (rClass.equals("UNKNOWN")) return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.tools.DistCp;
|
||||
import org.apache.hadoop.tools.DistCpOptions;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class MigrateActionSet {
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateActionSet.class);
|
||||
|
||||
private static final String SEPARATOR = "/";
|
||||
private static final String TARGET_PATHS = "target_paths";
|
||||
private static final String RAWSET_PREFIX = "rawset_";
|
||||
|
||||
private static Boolean DEFAULT_TRANSFORM_ONLY = false;
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new MigrateActionSet().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws Exception {
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String sourceNN = parser.get("sourceNameNode");
|
||||
final String targetNN = parser.get("targetNameNode");
|
||||
final String workDir = parser.get("workingDirectory");
|
||||
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
|
||||
|
||||
final String distcp_memory_mb = parser.get("distcp_memory_mb");
|
||||
final String distcp_task_timeout = parser.get("distcp_task_timeout");
|
||||
|
||||
final String transform_only_s = parser.get("transform_only");
|
||||
|
||||
log.info("transform only param: " + transform_only_s);
|
||||
|
||||
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
|
||||
|
||||
log.info("transform only: " + transformOnly);
|
||||
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
FileSystem targetFS = FileSystem.get(conf);
|
||||
|
||||
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
|
||||
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
|
||||
FileSystem sourceFS = FileSystem.get(sourceConf);
|
||||
|
||||
Properties props = new Properties();
|
||||
|
||||
List<Path> targetPaths = new ArrayList<>();
|
||||
|
||||
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
|
||||
log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))));
|
||||
for(Path source : sourcePaths) {
|
||||
|
||||
if (!sourceFS.exists(source)) {
|
||||
log.warn(String.format("skipping unexisting path: %s", source));
|
||||
} else {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
|
||||
|
||||
final String rawSet = pathQ.pollLast();
|
||||
log.info(String.format("got RAWSET: %s", rawSet));
|
||||
|
||||
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
|
||||
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
|
||||
|
||||
log.info(String.format("using TARGET PATH: %s", targetPath));
|
||||
|
||||
if (!transformOnly) {
|
||||
if (targetFS.exists(targetPath)) {
|
||||
targetFS.delete(targetPath, true);
|
||||
}
|
||||
runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
|
||||
}
|
||||
|
||||
targetPaths.add(targetPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
props.setProperty(TARGET_PATHS, targetPaths
|
||||
.stream()
|
||||
.map(p -> p.toString())
|
||||
.collect(Collectors.joining(",")));
|
||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||
|
||||
try(OutputStream os = new FileOutputStream(file)) {
|
||||
props.store(os, "");
|
||||
}
|
||||
System.out.println(file.getAbsolutePath());
|
||||
}
|
||||
|
||||
private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception {
|
||||
|
||||
final DistCpOptions op = new DistCpOptions(source, targetPath);
|
||||
op.setMaxMaps(distcp_num_maps);
|
||||
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
|
||||
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
|
||||
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
|
||||
|
||||
int res = ToolRunner.run(new DistCp(conf, op), new String[]{
|
||||
"-Dmapred.task.timeout=" + distcp_task_timeout,
|
||||
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
|
||||
"-pb",
|
||||
"-m " + distcp_num_maps,
|
||||
source.toString(),
|
||||
targetPath.toString()});
|
||||
|
||||
if (res != 0) {
|
||||
throw new RuntimeException(String.format("distcp exited with code %s", res));
|
||||
}
|
||||
}
|
||||
|
||||
private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
|
||||
conf.set("dfs.http.client.retry.policy.enabled", "true");
|
||||
conf.set("mapred.task.timeout", distcp_task_timeout);
|
||||
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
|
||||
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
|
||||
return conf;
|
||||
}
|
||||
|
||||
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException {
|
||||
String XQUERY = "distinct-values(\n" +
|
||||
"let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" +
|
||||
"for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" +
|
||||
"let $setDir := $x//SET/@directory/string()\n" +
|
||||
"let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" +
|
||||
"return concat($basePath, '/', $setDir, '/', $rawSet))";
|
||||
|
||||
log.info(String.format("running xquery:\n%s", XQUERY));
|
||||
return isLookUp.quickSearchProfile(XQUERY)
|
||||
.stream()
|
||||
.map(p -> sourceNN + p)
|
||||
.map(Path::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,580 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.googlecode.protobuf.format.JsonFormat;
|
||||
import eu.dnetlib.data.proto.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class ProtoConverter implements Serializable {
|
||||
|
||||
public static final String UNKNOWN = "UNKNOWN";
|
||||
public static final String NOT_AVAILABLE = "not available";
|
||||
public static final String DNET_ACCESS_MODES = "dnet:access_modes";
|
||||
|
||||
public static Oaf convert(OafProtos.Oaf oaf) {
|
||||
try {
|
||||
switch (oaf.getKind()) {
|
||||
case entity:
|
||||
return convertEntity(oaf);
|
||||
case relation:
|
||||
return convertRelation(oaf);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Relation convertRelation(OafProtos.Oaf oaf) {
|
||||
final OafProtos.OafRel r = oaf.getRel();
|
||||
final Relation rel = new Relation();
|
||||
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
|
||||
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
|
||||
rel.setSource(r.getSource());
|
||||
rel.setTarget(r.getTarget());
|
||||
rel.setRelType(r.getRelType().toString());
|
||||
rel.setSubRelType(r.getSubRelType().toString());
|
||||
rel.setRelClass(r.getRelClass());
|
||||
rel.setCollectedFrom(r.getCollectedfromCount() > 0 ?
|
||||
r.getCollectedfromList().stream()
|
||||
.map(kv -> mapKV(kv))
|
||||
.collect(Collectors.toList()) : null);
|
||||
return rel;
|
||||
}
|
||||
|
||||
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
|
||||
|
||||
switch (oaf.getEntity().getType()) {
|
||||
case result:
|
||||
final Result r = convertResult(oaf);
|
||||
r.setInstance(convertInstances(oaf));
|
||||
return r;
|
||||
case project:
|
||||
return convertProject(oaf);
|
||||
case datasource:
|
||||
return convertDataSource(oaf);
|
||||
case organization:
|
||||
return convertOrganization(oaf);
|
||||
default:
|
||||
throw new RuntimeException("received unknown type");
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
|
||||
|
||||
final ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getInstanceCount() > 0) {
|
||||
return r.getInstanceList()
|
||||
.stream()
|
||||
.map(i -> convertInstance(i))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
||||
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
|
||||
final Instance i = new Instance();
|
||||
i.setAccessright(mapQualifier(ri.getAccessright()));
|
||||
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
|
||||
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
|
||||
i.setDistributionlocation(ri.getDistributionlocation());
|
||||
i.setHostedby(mapKV(ri.getHostedby()));
|
||||
i.setInstancetype(mapQualifier(ri.getInstancetype()));
|
||||
i.setLicense(mapStringField(ri.getLicense()));
|
||||
i.setUrl(ri.getUrlList());
|
||||
i.setRefereed(mapStringField(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Organization convertOrganization(OafProtos.Oaf oaf) {
|
||||
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
|
||||
final Organization org = setOaf(new Organization(), oaf);
|
||||
setEntity(org, oaf);
|
||||
org.setLegalshortname(mapStringField(m.getLegalshortname()));
|
||||
org.setLegalname(mapStringField(m.getLegalname()));
|
||||
org.setAlternativeNames(m.getAlternativeNamesList().
|
||||
stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
org.setLogourl(mapStringField(m.getLogourl()));
|
||||
org.setEclegalbody(mapStringField(m.getEclegalbody()));
|
||||
org.setEclegalperson(mapStringField(m.getEclegalperson()));
|
||||
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
|
||||
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
|
||||
org.setEchighereducation(mapStringField(m.getEchighereducation()));
|
||||
org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests()));
|
||||
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
|
||||
org.setEcenterprise(mapStringField(m.getEcenterprise()));
|
||||
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
|
||||
org.setEcnutscode(mapStringField(m.getEcnutscode()));
|
||||
org.setCountry(mapQualifier(m.getCountry()));
|
||||
|
||||
return org;
|
||||
}
|
||||
|
||||
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
|
||||
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
|
||||
final Datasource datasource = setOaf(new Datasource(), oaf);
|
||||
setEntity(datasource, oaf);
|
||||
datasource.setAccessinfopackage(m.getAccessinfopackageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setCertificates(mapStringField(m.getCertificates()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setContactemail(mapStringField(m.getContactemail()));
|
||||
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
|
||||
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
|
||||
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
|
||||
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
|
||||
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
|
||||
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
|
||||
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
|
||||
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
|
||||
datasource.setDescription(mapStringField(m.getDescription()));
|
||||
datasource.setEnglishname(mapStringField(m.getEnglishname()));
|
||||
datasource.setLatitude(mapStringField(m.getLatitude()));
|
||||
datasource.setLongitude(mapStringField(m.getLongitude()));
|
||||
datasource.setLogourl(mapStringField(m.getLogourl()));
|
||||
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
|
||||
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
|
||||
datasource.setOdcontenttypes(m.getOdcontenttypesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setOdlanguages(m.getOdlanguagesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
|
||||
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
|
||||
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
|
||||
datasource.setOfficialname(mapStringField(m.getOfficialname()));
|
||||
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
|
||||
datasource.setPidsystems(mapStringField(m.getPidsystems()));
|
||||
datasource.setPolicies(m.getPoliciesList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapKV)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
|
||||
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
|
||||
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
|
||||
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
|
||||
datasource.setSubjects(m.getSubjectsList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
datasource.setVersioning(mapBoolField(m.getVersioning()));
|
||||
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
datasource.setJournal(mapJournal(m.getJournal()));
|
||||
|
||||
|
||||
return datasource;
|
||||
}
|
||||
|
||||
private static Project convertProject(OafProtos.Oaf oaf) {
|
||||
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
|
||||
final Project project = setOaf(new Project(), oaf);
|
||||
setEntity(project, oaf);
|
||||
project.setAcronym(mapStringField(m.getAcronym()));
|
||||
project.setCallidentifier(mapStringField(m.getCallidentifier()));
|
||||
project.setCode(mapStringField(m.getCode()));
|
||||
project.setContactemail(mapStringField(m.getContactemail()));
|
||||
project.setContactfax(mapStringField(m.getContactfax()));
|
||||
project.setContactfullname(mapStringField(m.getContactfullname()));
|
||||
project.setContactphone(mapStringField(m.getContactphone()));
|
||||
project.setContracttype(mapQualifier(m.getContracttype()));
|
||||
project.setCurrency(mapStringField(m.getCurrency()));
|
||||
project.setDuration(mapStringField(m.getDuration()));
|
||||
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
|
||||
project.setEcsc39(mapStringField(m.getEcsc39()));
|
||||
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
|
||||
project.setStartdate(mapStringField(m.getStartdate()));
|
||||
project.setEnddate(mapStringField(m.getEnddate()));
|
||||
project.setFundedamount(m.getFundedamount());
|
||||
project.setTotalcost(m.getTotalcost());
|
||||
project.setKeywords(mapStringField(m.getKeywords()));
|
||||
project.setSubjects(m.getSubjectsList().stream()
|
||||
.map(sp -> mapStructuredProperty(sp))
|
||||
.collect(Collectors.toList()));
|
||||
project.setTitle(mapStringField(m.getTitle()));
|
||||
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
|
||||
project.setFundingtree(m.getFundingtreeList().stream()
|
||||
.map(f -> mapStringField(f))
|
||||
.collect(Collectors.toList()));
|
||||
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
|
||||
project.setSummary(mapStringField(m.getSummary()));
|
||||
project.setOptional1(mapStringField(m.getOptional1()));
|
||||
project.setOptional2(mapStringField(m.getOptional2()));
|
||||
return project;
|
||||
}
|
||||
|
||||
private static Result convertResult(OafProtos.Oaf oaf) {
|
||||
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
|
||||
case "dataset":
|
||||
return createDataset(oaf);
|
||||
case "publication":
|
||||
return createPublication(oaf);
|
||||
case "software":
|
||||
return createSoftware(oaf);
|
||||
case "other":
|
||||
return createORP(oaf);
|
||||
default:
|
||||
Result result = setOaf(new Result(), oaf);
|
||||
setEntity(result, oaf);
|
||||
return setResult(result, oaf);
|
||||
}
|
||||
}
|
||||
|
||||
private static Software createSoftware(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Software software = setOaf(new Software(), oaf);
|
||||
setEntity(software, oaf);
|
||||
setResult(software, oaf);
|
||||
|
||||
software.setDocumentationUrl(m.getDocumentationUrlList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
software.setLicense(m.getLicenseList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
|
||||
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
|
||||
return software;
|
||||
}
|
||||
|
||||
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
|
||||
setEntity(otherResearchProducts, oaf);
|
||||
setResult(otherResearchProducts, oaf);
|
||||
otherResearchProducts.setContactperson(m.getContactpersonList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts.setContactgroup(m.getContactgroupList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
otherResearchProducts.setTool(m.getToolList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return otherResearchProducts;
|
||||
}
|
||||
|
||||
private static Publication createPublication(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Publication publication = setOaf(new Publication(), oaf);
|
||||
setEntity(publication, oaf);
|
||||
setResult(publication, oaf);
|
||||
publication.setJournal(mapJournal(m.getJournal()));
|
||||
return publication;
|
||||
}
|
||||
|
||||
private static Dataset createDataset(OafProtos.Oaf oaf) {
|
||||
|
||||
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
Dataset dataset = setOaf(new Dataset(), oaf);
|
||||
setEntity(dataset, oaf);
|
||||
setResult(dataset, oaf);
|
||||
dataset.setStoragedate(mapStringField(m.getStoragedate()));
|
||||
dataset.setDevice(mapStringField(m.getDevice()));
|
||||
dataset.setSize(mapStringField(m.getSize()));
|
||||
dataset.setVersion(mapStringField(m.getVersion()));
|
||||
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
|
||||
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
|
||||
dataset.setGeolocation(m.getGeolocationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapGeolocation)
|
||||
.collect(Collectors.toList()));
|
||||
return dataset;
|
||||
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
|
||||
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
|
||||
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
|
||||
return oaf;
|
||||
}
|
||||
|
||||
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
|
||||
//setting Entity fields
|
||||
final OafProtos.OafEntity e = oaf.getEntity();
|
||||
entity.setId(e.getId());
|
||||
entity.setOriginalId(e.getOriginalIdList());
|
||||
entity.setCollectedfrom(e.getCollectedfromList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapKV)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setPid(e.getPidList().stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofcollection(e.getDateofcollection());
|
||||
entity.setDateoftransformation(e.getDateoftransformation());
|
||||
entity.setExtraInfo(e.getExtraInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapExtraInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
|
||||
//setting Entity fields
|
||||
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
|
||||
entity.setAuthor(m.getAuthorList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapAuthor)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setResulttype(mapQualifier(m.getResulttype()));
|
||||
entity.setLanguage(mapQualifier(m.getLanguage()));
|
||||
entity.setCountry(m.getCountryList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapQualifierAsCountry)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setSubject(m.getSubjectList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setTitle(m.getTitleList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setRelevantdate(m.getRelevantdateList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDescription(m.getDescriptionList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
|
||||
entity.setPublisher(mapStringField(m.getPublisher()));
|
||||
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
|
||||
entity.setSource(m.getSourceList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setFulltext(m.getFulltextList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setFormat(m.getFormatList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setContributor(m.getContributorList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setResourcetype(mapQualifier(m.getResourcetype()));
|
||||
entity.setCoverage(m.getCoverageList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
entity.setContext(m.getContextList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapContext)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
|
||||
|
||||
return entity;
|
||||
}
|
||||
|
||||
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
|
||||
if (instanceList != null) {
|
||||
final Optional<FieldTypeProtos.Qualifier> min = instanceList.stream()
|
||||
.map(i -> i.getAccessright()).min(new LicenseComparator());
|
||||
|
||||
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
|
||||
|
||||
if (StringUtils.isBlank(rights.getClassid())) {
|
||||
rights.setClassid(UNKNOWN);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||
rights.setClassname(NOT_AVAILABLE);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||
}
|
||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||
rights.setSchemename(DNET_ACCESS_MODES);
|
||||
}
|
||||
|
||||
return rights;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static Context mapContext(ResultProtos.Result.Context context) {
|
||||
|
||||
final Context entity = new Context();
|
||||
entity.setId(context.getId());
|
||||
entity.setDataInfo(context.getDataInfoList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapDataInfo)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
|
||||
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
|
||||
final KeyValue keyValue = new KeyValue();
|
||||
keyValue.setKey(kv.getKey());
|
||||
keyValue.setValue(kv.getValue());
|
||||
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
|
||||
return keyValue;
|
||||
}
|
||||
|
||||
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
|
||||
final DataInfo dataInfo = new DataInfo();
|
||||
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
|
||||
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
|
||||
dataInfo.setInferred(d.getInferred());
|
||||
dataInfo.setInvisible(d.getInvisible());
|
||||
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
|
||||
dataInfo.setTrust(d.getTrust());
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
|
||||
final Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid(q.getClassid());
|
||||
qualifier.setClassname(q.getClassname());
|
||||
qualifier.setSchemeid(q.getSchemeid());
|
||||
qualifier.setSchemename(q.getSchemename());
|
||||
return qualifier;
|
||||
}
|
||||
|
||||
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
|
||||
final Country c = new Country();
|
||||
c.setClassid(q.getClassid());
|
||||
c.setClassname(q.getClassname());
|
||||
c.setSchemeid(q.getSchemeid());
|
||||
c.setSchemename(q.getSchemename());
|
||||
c.setDataInfo(mapDataInfo(q.getDataInfo()));
|
||||
return c;
|
||||
}
|
||||
|
||||
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
|
||||
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||
structuredProperty.setValue(sp.getValue());
|
||||
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
|
||||
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
|
||||
return structuredProperty;
|
||||
}
|
||||
|
||||
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
|
||||
final ExtraInfo entity = new ExtraInfo();
|
||||
entity.setName(extraInfo.getName());
|
||||
entity.setTypology(extraInfo.getTypology());
|
||||
entity.setProvenance(extraInfo.getProvenance());
|
||||
entity.setTrust(extraInfo.getTrust());
|
||||
entity.setValue(extraInfo.getValue());
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
|
||||
final OAIProvenance entity = new OAIProvenance();
|
||||
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
|
||||
final OriginDescription originDescriptionResult = new OriginDescription();
|
||||
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
|
||||
originDescriptionResult.setAltered(originDescription.getAltered());
|
||||
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
|
||||
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
|
||||
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
|
||||
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
|
||||
return originDescriptionResult;
|
||||
}
|
||||
|
||||
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
|
||||
final Field<String> stringField = new Field<>();
|
||||
stringField.setValue(s.getValue());
|
||||
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
|
||||
return stringField;
|
||||
}
|
||||
|
||||
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
|
||||
final Field<Boolean> booleanField = new Field<>();
|
||||
booleanField.setValue(b.getValue());
|
||||
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||
return booleanField;
|
||||
}
|
||||
|
||||
public static Field<Integer> mapIntField(FieldTypeProtos.IntField b) {
|
||||
final Field<Integer> entity = new Field<>();
|
||||
entity.setValue(b.getValue());
|
||||
entity.setDataInfo(mapDataInfo(b.getDataInfo()));
|
||||
return entity;
|
||||
}
|
||||
|
||||
public static Journal mapJournal(FieldTypeProtos.Journal j) {
|
||||
final Journal journal = new Journal();
|
||||
journal.setConferencedate(j.getConferencedate());
|
||||
journal.setConferenceplace(j.getConferenceplace());
|
||||
journal.setEdition(j.getEdition());
|
||||
journal.setEp(j.getEp());
|
||||
journal.setIss(j.getIss());
|
||||
journal.setIssnLinking(j.getIssnLinking());
|
||||
journal.setIssnOnline(j.getIssnOnline());
|
||||
journal.setIssnPrinted(j.getIssnPrinted());
|
||||
journal.setName(j.getName());
|
||||
journal.setSp(j.getSp());
|
||||
journal.setVol(j.getVol());
|
||||
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
|
||||
return journal;
|
||||
}
|
||||
|
||||
public static Author mapAuthor(FieldTypeProtos.Author author) {
|
||||
final Author entity = new Author();
|
||||
entity.setFullname(author.getFullname());
|
||||
entity.setName(author.getName());
|
||||
entity.setSurname(author.getSurname());
|
||||
entity.setRank(author.getRank());
|
||||
entity.setPid(author.getPidList()
|
||||
.stream()
|
||||
.map(kv -> {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(kv.getValue());
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(kv.getKey());
|
||||
q.setClassname(kv.getKey());
|
||||
sp.setQualifier(q);
|
||||
return sp;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
entity.setAffiliation(author.getAffiliationList()
|
||||
.stream()
|
||||
.map(ProtoConverter::mapStringField)
|
||||
.collect(Collectors.toList()));
|
||||
return entity;
|
||||
|
||||
}
|
||||
|
||||
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
|
||||
final GeoLocation entity = new GeoLocation();
|
||||
entity.setPoint(geoLocation.getPoint());
|
||||
entity.setBox(geoLocation.getBox());
|
||||
entity.setPlace(geoLocation.getPlace());
|
||||
return entity;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,194 @@
|
|||
package eu.dnetlib.dhp.migration.actions;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.data.proto.OafProtos;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.LinkedList;
|
||||
|
||||
public class TransformActions implements Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(TransformActions.class);
|
||||
private static final String SEPARATOR = "/";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateActionSet.class.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new TransformActions().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: " + isLookupUrl);
|
||||
|
||||
final String inputPaths = parser.get("inputPaths");
|
||||
|
||||
if (StringUtils.isBlank(inputPaths)) {
|
||||
throw new RuntimeException("empty inputPaths");
|
||||
}
|
||||
log.info("inputPaths: " + inputPaths);
|
||||
|
||||
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
|
||||
|
||||
try(SparkSession spark = getSparkSession(parser)) {
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
|
||||
for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
|
||||
|
||||
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
|
||||
|
||||
final String rawset = pathQ.pollLast();
|
||||
final String actionSetDirectory = pathQ.pollLast();
|
||||
|
||||
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
|
||||
|
||||
if (fs.exists(targetDirectory)) {
|
||||
log.info(String.format("found target directory '%s", targetDirectory));
|
||||
fs.delete(targetDirectory, true);
|
||||
log.info(String.format("deleted target directory '%s", targetDirectory));
|
||||
}
|
||||
|
||||
log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory));
|
||||
|
||||
sc.sequenceFile(sourcePath, Text.class, Text.class)
|
||||
.mapToPair(a -> new Tuple2<>(a._1(), eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())))
|
||||
.mapToPair(a -> new Tuple2<>(a._1(), transformAction(a._1().toString(), a._2())))
|
||||
.filter(t -> StringUtils.isNotBlank(t._2().toString()))
|
||||
.saveAsHadoopFile(targetDirectory.toString(), Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Text transformAction(String atomicaActionId, eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, JsonProcessingException {
|
||||
final Text out = new Text();
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
|
||||
out.set(mapper.writeValueAsString(doTransform(aa)));
|
||||
} else {
|
||||
if (atomicaActionId.contains("dedupSimilarity")) {
|
||||
out.set(mapper.writeValueAsString(getRelationAtomicAction(atomicaActionId)));
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
private AtomicAction<Relation> getRelationAtomicAction(String atomicaActionId) {
|
||||
final String[] splitId = atomicaActionId.split("@");
|
||||
|
||||
String source = splitId[0];
|
||||
String target = splitId[2];
|
||||
|
||||
String[] relSemantic = splitId[1].split("_");
|
||||
|
||||
Relation rel = new Relation();
|
||||
rel.setSource(source);
|
||||
rel.setTarget(target);
|
||||
rel.setRelType(relSemantic[0]);
|
||||
rel.setSubRelType(relSemantic[1]);
|
||||
rel.setRelClass(relSemantic[2]);
|
||||
|
||||
DataInfo d = new DataInfo();
|
||||
d.setDeletedbyinference(false);
|
||||
d.setInferenceprovenance("deduplication");
|
||||
d.setInferred(true);
|
||||
d.setInvisible(false);
|
||||
Qualifier provenanceaction = new Qualifier();
|
||||
|
||||
provenanceaction.setClassid("deduplication");
|
||||
provenanceaction.setClassname("deduplication");
|
||||
provenanceaction.setSchemeid("dnet:provenanceActions");
|
||||
provenanceaction.setSchemename("dnet:provenanceActions");
|
||||
|
||||
d.setProvenanceaction(provenanceaction);
|
||||
|
||||
rel.setDataInfo(d);
|
||||
|
||||
return new AtomicAction<>(Relation.class, rel);
|
||||
}
|
||||
|
||||
private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException {
|
||||
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
|
||||
final Oaf oaf = ProtoConverter.convert(proto_oaf);
|
||||
switch (proto_oaf.getKind()) {
|
||||
case entity:
|
||||
switch (proto_oaf.getEntity().getType()) {
|
||||
case datasource:
|
||||
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
|
||||
case organization:
|
||||
return new AtomicAction<>(Organization.class, (Organization) oaf);
|
||||
case project:
|
||||
return new AtomicAction<>(Project.class, (Project) oaf);
|
||||
case result:
|
||||
final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid();
|
||||
switch (resulttypeid) {
|
||||
case "publication":
|
||||
return new AtomicAction<>(Publication.class, (Publication) oaf);
|
||||
case "software":
|
||||
return new AtomicAction<>(Software.class, (Software) oaf);
|
||||
case "other":
|
||||
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
|
||||
case "dataset":
|
||||
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
|
||||
default:
|
||||
// can be an update, where the resulttype is not specified
|
||||
return new AtomicAction<>(Result.class, (Result) oaf);
|
||||
}
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType());
|
||||
}
|
||||
case relation:
|
||||
return new AtomicAction<>(Relation.class, (Relation) oaf);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
|
||||
}
|
||||
}
|
||||
|
||||
private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
|
||||
return isLookUp.getResourceProfileByQuery(XQUERY);
|
||||
}
|
||||
|
||||
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(TransformActions.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config(conf)
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,481 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.Array;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
|
||||
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
|
||||
|
||||
private final DbClient dbClient;
|
||||
|
||||
private final long lastUpdateTimestamp;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
|
||||
final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
|
||||
|
||||
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) {
|
||||
if (processClaims) {
|
||||
log.info("Processing claims...");
|
||||
smdbe.execute("queryClaims.sql", smdbe::processClaims);
|
||||
} else {
|
||||
log.info("Processing datasources...");
|
||||
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
|
||||
|
||||
log.info("Processing projects...");
|
||||
smdbe.execute("queryProjects.sql", smdbe::processProject);
|
||||
|
||||
log.info("Processing orgs...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
|
||||
|
||||
log.info("Processing relations ds <-> orgs ...");
|
||||
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
|
||||
|
||||
log.info("Processing projects <-> orgs ...");
|
||||
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
|
||||
}
|
||||
log.info("All done.");
|
||||
}
|
||||
}
|
||||
|
||||
protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST
|
||||
super();
|
||||
this.dbClient = null;
|
||||
this.lastUpdateTimestamp = new Date().getTime();
|
||||
}
|
||||
|
||||
public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
super(hdfsPath);
|
||||
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
|
||||
this.lastUpdateTimestamp = new Date().getTime();
|
||||
}
|
||||
|
||||
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) throws Exception {
|
||||
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
|
||||
|
||||
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf));
|
||||
|
||||
dbClient.processResults(sql, consumer);
|
||||
}
|
||||
|
||||
public List<Oaf> processDatasource(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Datasource ds = new Datasource();
|
||||
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
|
||||
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
|
||||
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
ds.setPid(new ArrayList<>());
|
||||
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
ds.setDateoftransformation(null); // Value not returned by the SQL query
|
||||
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
ds.setOaiprovenance(null); // Values not present in the DB
|
||||
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
|
||||
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
|
||||
ds.setOfficialname(field(rs.getString("officialname"), info));
|
||||
ds.setEnglishname(field(rs.getString("englishname"), info));
|
||||
ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
|
||||
ds.setLogourl(field(rs.getString("logourl"), info));
|
||||
ds.setContactemail(field(rs.getString("contactemail"), info));
|
||||
ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
|
||||
ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
|
||||
ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
|
||||
ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
|
||||
ds.setDescription(field(rs.getString("description"), info));
|
||||
ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
|
||||
ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
|
||||
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
|
||||
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
|
||||
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
|
||||
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
|
||||
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
|
||||
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
|
||||
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
|
||||
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
|
||||
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
|
||||
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
|
||||
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
|
||||
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
|
||||
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
|
||||
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
|
||||
ds.setVersioning(field(rs.getBoolean("versioning"), info));
|
||||
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
|
||||
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
|
||||
ds.setPidsystems(field(rs.getString("pidsystems"), info));
|
||||
ds.setCertificates(field(rs.getString("certificates"), info));
|
||||
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
||||
ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
|
||||
ds.setDataInfo(info);
|
||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(ds);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processProject(final ResultSet rs) {
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Project p = new Project();
|
||||
|
||||
p.setId(createOpenaireId(40, rs.getString("projectid"), true));
|
||||
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
|
||||
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
p.setPid(new ArrayList<>());
|
||||
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
|
||||
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
p.setOaiprovenance(null); // Values not present in the DB
|
||||
p.setWebsiteurl(field(rs.getString("websiteurl"), info));
|
||||
p.setCode(field(rs.getString("code"), info));
|
||||
p.setAcronym(field(rs.getString("acronym"), info));
|
||||
p.setTitle(field(rs.getString("title"), info));
|
||||
p.setStartdate(field(asString(rs.getDate("startdate")), info));
|
||||
p.setEnddate(field(asString(rs.getDate("enddate")), info));
|
||||
p.setCallidentifier(field(rs.getString("callidentifier"), info));
|
||||
p.setKeywords(field(rs.getString("keywords"), info));
|
||||
p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
|
||||
p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
|
||||
p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
|
||||
p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
|
||||
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
|
||||
p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
|
||||
p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
|
||||
p.setOptional1(field(rs.getString("optional1"), info));
|
||||
p.setOptional2(field(rs.getString("optional2"), info));
|
||||
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
|
||||
p.setContactfullname(field(rs.getString("contactfullname"), info));
|
||||
p.setContactfax(field(rs.getString("contactfax"), info));
|
||||
p.setContactphone(field(rs.getString("contactphone"), info));
|
||||
p.setContactemail(field(rs.getString("contactemail"), info));
|
||||
p.setSummary(field(rs.getString("summary"), info));
|
||||
p.setCurrency(field(rs.getString("currency"), info));
|
||||
p.setTotalcost(new Float(rs.getDouble("totalcost")));
|
||||
p.setFundedamount(new Float(rs.getDouble("fundedamount")));
|
||||
p.setDataInfo(info);
|
||||
p.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(p);
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processOrganization(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Organization o = new Organization();
|
||||
|
||||
o.setId(createOpenaireId(20, rs.getString("organizationid"), true));
|
||||
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
|
||||
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
o.setPid(new ArrayList<>());
|
||||
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
|
||||
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
o.setOaiprovenance(null); // Values not present in the DB
|
||||
o.setLegalshortname(field(rs.getString("legalshortname"), info));
|
||||
o.setLegalname(field(rs.getString("legalname"), info));
|
||||
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
|
||||
o.setWebsiteurl(field(rs.getString("websiteurl"), info));
|
||||
o.setLogourl(field(rs.getString("logourl"), info));
|
||||
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
|
||||
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
|
||||
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
|
||||
o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
|
||||
o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
|
||||
o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
|
||||
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
|
||||
o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
|
||||
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
|
||||
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
|
||||
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
|
||||
o.setDataInfo(info);
|
||||
o.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(o);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("organization"), true);
|
||||
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("datasourceOrganization");
|
||||
r1.setSubRelType("provision");
|
||||
r1.setRelClass("isProvidedBy");
|
||||
r1.setSource(dsId);
|
||||
r1.setTarget(orgId);
|
||||
r1.setCollectedFrom(collectedFrom);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("datasourceOrganization");
|
||||
r2.setSubRelType("provision");
|
||||
r2.setRelClass("provides");
|
||||
r2.setSource(orgId);
|
||||
r2.setTarget(dsId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(r1, r2);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processProjectOrganization(final ResultSet rs) {
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
|
||||
final String projectId = createOpenaireId(40, rs.getString("project"), true);
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("projectOrganization");
|
||||
r1.setSubRelType("participation");
|
||||
r1.setRelClass("isParticipant");
|
||||
r1.setSource(projectId);
|
||||
r1.setTarget(orgId);
|
||||
r1.setCollectedFrom(collectedFrom);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("projectOrganization");
|
||||
r2.setSubRelType("participation");
|
||||
r2.setRelClass("hasParticipant");
|
||||
r2.setSource(orgId);
|
||||
r2.setTarget(projectId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(r1, r2);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<Oaf> processClaims(final ResultSet rs) {
|
||||
|
||||
final DataInfo info =
|
||||
dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
|
||||
|
||||
try {
|
||||
|
||||
if (rs.getString("source_type").equals("context")) {
|
||||
final Result r;
|
||||
|
||||
if (rs.getString("target_type").equals("dataset")) {
|
||||
r = new Dataset();
|
||||
} else if (rs.getString("target_type").equals("software")) {
|
||||
r = new Software();
|
||||
} else if (rs.getString("target_type").equals("other")) {
|
||||
r = new OtherResearchProduct();
|
||||
} else {
|
||||
r = new Publication();
|
||||
}
|
||||
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r.setContext(prepareContext(rs.getString("source_id"), info));
|
||||
r.setDataInfo(info);
|
||||
|
||||
return Arrays.asList(r);
|
||||
} else {
|
||||
final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
|
||||
final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
final Relation r2 = new Relation();
|
||||
|
||||
if (rs.getString("source_type").equals("project")) {
|
||||
r1.setRelType("resultProject");
|
||||
r1.setSubRelType("outcome");
|
||||
r1.setRelClass("produces");
|
||||
|
||||
r2.setRelType("resultProject");
|
||||
r2.setSubRelType("outcome");
|
||||
r2.setRelClass("isProducedBy");
|
||||
} else {
|
||||
r1.setRelType("resultResult");
|
||||
r1.setSubRelType("relationship");
|
||||
r1.setRelClass("isRelatedTo");
|
||||
|
||||
r2.setRelType("resultResult");
|
||||
r2.setSubRelType("relationship");
|
||||
r2.setRelClass("isRelatedTo");
|
||||
}
|
||||
|
||||
r1.setSource(sourceId);
|
||||
r1.setTarget(targetId);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
r2.setSource(targetId);
|
||||
r2.setTarget(sourceId);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
return Arrays.asList(r1, r2);
|
||||
}
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
|
||||
final Context context = new Context();
|
||||
context.setId(id);
|
||||
context.setDataInfo(Arrays.asList(dataInfo));
|
||||
return Arrays.asList(context);
|
||||
}
|
||||
|
||||
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
|
||||
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
|
||||
final String inferenceprovenance = rs.getString("inferenceprovenance");
|
||||
final Boolean inferred = rs.getBoolean("inferred");
|
||||
final String trust = rs.getString("trust");
|
||||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
|
||||
}
|
||||
|
||||
private Qualifier prepareQualifierSplitting(final String s) {
|
||||
if (StringUtils.isBlank(s)) { return null; }
|
||||
final String[] arr = s.split("@@@");
|
||||
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
|
||||
}
|
||||
|
||||
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
|
||||
try {
|
||||
return array != null ? listFields(info, (String[]) array.getArray()) : new ArrayList<>();
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException("Invalid SQL array", e);
|
||||
}
|
||||
}
|
||||
|
||||
private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
|
||||
if (StringUtils.isBlank(s)) { return null; }
|
||||
final String[] parts = s.split("###");
|
||||
if (parts.length == 2) {
|
||||
final String value = parts[0];
|
||||
final String[] arr = parts[1].split("@@@");
|
||||
if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
if (array != null) {
|
||||
for (final String s : (String[]) array.getArray()) {
|
||||
final StructuredProperty sp = prepareStructProp(s, dataInfo);
|
||||
if (sp != null) {
|
||||
res.add(sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
|
||||
if (StringUtils.isNotBlank(sj)) {
|
||||
final String[] arr = sj.split("@@@");
|
||||
if (arr.length == 3) {
|
||||
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
|
||||
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;;
|
||||
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
|
||||
if (issn != null || eissn != null
|
||||
|| lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); }
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
dbClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.MdstoreClient;
|
||||
|
||||
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
|
||||
|
||||
private final MdstoreClient mdstoreClient;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||
final String mongoDb = parser.get("mongoDb");
|
||||
|
||||
final String mdFormat = parser.get("mdFormat");
|
||||
final String mdLayout = parser.get("mdLayout");
|
||||
final String mdInterpretation = parser.get("mdInterpretation");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
|
||||
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) {
|
||||
app.execute(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
|
||||
super(hdfsPath);
|
||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||
}
|
||||
|
||||
public void execute(final String format, final String layout, final String interpretation) {
|
||||
final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
|
||||
log.info("Found " + colls.size() + " mdstores");
|
||||
|
||||
for (final Entry<String, String> entry : colls.entrySet()) {
|
||||
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
|
||||
final String currentColl = entry.getValue();
|
||||
|
||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||
emit(xml, "native_" + format);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
mdstoreClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,394 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentFactory;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public abstract class AbstractMdRecordToOafMapper {
|
||||
|
||||
protected final Map<String, String> code2name;
|
||||
|
||||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
|
||||
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
|
||||
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
|
||||
|
||||
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
|
||||
this.code2name = code2name;
|
||||
}
|
||||
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
try {
|
||||
final Map<String, String> nsContext = new HashMap<>();
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
|
||||
final Document doc = DocumentHelper.parseText(xml);
|
||||
|
||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
|
||||
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
|
||||
|
||||
final DataInfo info = prepareDataInfo(doc);
|
||||
final long lastUpdateTimestamp = new Date().getTime();
|
||||
|
||||
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected List<Oaf> createOafs(final Document doc,
|
||||
final String type,
|
||||
final KeyValue collectedFrom,
|
||||
final KeyValue hostedBy,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
|
||||
final List<Oaf> oafs = new ArrayList<>();
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "":
|
||||
case "publication":
|
||||
final Publication p = new Publication();
|
||||
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
|
||||
p.setJournal(prepareJournal(doc, info));
|
||||
oafs.add(p);
|
||||
break;
|
||||
case "dataset":
|
||||
final Dataset d = new Dataset();
|
||||
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
|
||||
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||
d.setDevice(prepareDatasetDevice(doc, info));
|
||||
d.setSize(prepareDatasetSize(doc, info));
|
||||
d.setVersion(prepareDatasetVersion(doc, info));
|
||||
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
|
||||
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
|
||||
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
|
||||
oafs.add(d);
|
||||
break;
|
||||
case "software":
|
||||
final Software s = new Software();
|
||||
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
|
||||
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
||||
s.setLicense(prepareSoftwareLicenses(doc, info));
|
||||
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
||||
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
||||
oafs.add(s);
|
||||
break;
|
||||
case "otherresearchproducts":
|
||||
default:
|
||||
final OtherResearchProduct o = new OtherResearchProduct();
|
||||
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
|
||||
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
||||
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
||||
o.setTool(prepareOtherResearchProductTools(doc, info));
|
||||
oafs.add(o);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!oafs.isEmpty()) {
|
||||
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||
}
|
||||
|
||||
return oafs;
|
||||
}
|
||||
|
||||
private List<Oaf> addProjectRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
||||
final String projectId = createOpenaireId(40, ((Node) o).getText(), true);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultProject");
|
||||
r1.setSubRelType("outcome");
|
||||
r1.setRelClass("isProducedBy");
|
||||
r1.setSource(docId);
|
||||
r1.setTarget(projectId);
|
||||
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("resultProject");
|
||||
r2.setSubRelType("outcome");
|
||||
r2.setRelClass("produces");
|
||||
r2.setSource(projectId);
|
||||
r2.setTarget(docId);
|
||||
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r2);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
protected abstract List<Oaf> addOtherResultRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp);
|
||||
|
||||
private void populateResultFields(final Result r,
|
||||
final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final KeyValue hostedBy,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
r.setDataInfo(info);
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
||||
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
||||
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setOaiprovenance(prepareOAIprovenance(doc));
|
||||
r.setAuthor(prepareAuthors(doc, info));
|
||||
r.setLanguage(prepareLanguages(doc));
|
||||
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setSubject(prepareSubjects(doc, info));
|
||||
r.setTitle(prepareTitles(doc, info));
|
||||
r.setRelevantdate(prepareRelevantDates(doc, info));
|
||||
r.setDescription(prepareDescriptions(doc, info));
|
||||
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
|
||||
r.setPublisher(preparePublisher(doc, info));
|
||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||
r.setSource(prepareSources(doc, info));
|
||||
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setFormat(prepareFormats(doc, info));
|
||||
r.setContributor(prepareContributors(doc, info));
|
||||
r.setResourcetype(prepareResourceType(doc, info));
|
||||
r.setCoverage(prepareCoverages(doc, info));
|
||||
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
||||
}
|
||||
|
||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
|
||||
|
||||
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareLanguages(Document doc);
|
||||
|
||||
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||
|
||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||
if (n != null) {
|
||||
final String name = n.getText();
|
||||
final String issnPrinted = n.valueOf("@issn");
|
||||
final String issnOnline = n.valueOf("@eissn");
|
||||
final String issnLinking = n.valueOf("@lissn");
|
||||
final String ep = n.valueOf("@ep");
|
||||
final String iss = n.valueOf("@iss");
|
||||
final String sp = n.valueOf("@sp");
|
||||
final String vol = n.valueOf("@vol");
|
||||
final String edition = n.valueOf("@edition");
|
||||
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
|
||||
final String classId = node.valueOf(xpath);
|
||||
final String className = code2name.get(classId);
|
||||
return qualifier(classId, className, schemeId, schemeName);
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node,
|
||||
final String xpath,
|
||||
final String xpathClassId,
|
||||
final String schemeId,
|
||||
final String schemeName,
|
||||
final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
final String classId = n.valueOf(xpathClassId);
|
||||
final String className = code2name.get(classId);
|
||||
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), qualifier, info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
|
||||
.valueOf("@schemename"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected OAIProvenance prepareOAIprovenance(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
||||
|
||||
if (n == null) { return null; }
|
||||
|
||||
final String identifier = n.valueOf("./*[local-name()='identifier']");
|
||||
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
|
||||
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
|
||||
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
|
||||
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
|
||||
final String harvestDate = n.valueOf("@harvestDate");;
|
||||
|
||||
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
|
||||
|
||||
}
|
||||
|
||||
protected DataInfo prepareDataInfo(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||
|
||||
if (n == null) { return null; }
|
||||
|
||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
|
||||
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
|
||||
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
|
||||
|
||||
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
|
||||
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
|
||||
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
|
||||
final String trust = n.valueOf("./oaf:trust");
|
||||
|
||||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||
}
|
||||
|
||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||
return field(node.valueOf(xpath), info);
|
||||
}
|
||||
|
||||
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
|
||||
return listFields(info, prepareListString(node, xpath));
|
||||
}
|
||||
|
||||
protected List<String> prepareListString(final Node node, final String xpath) {
|
||||
final List<String> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final String s = ((Node) o).getText().trim();
|
||||
if (StringUtils.isNotBlank(s)) {
|
||||
res.add(s);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,173 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String sourcePaths = parser.get("sourcePaths");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
|
||||
generateEntities(sc, code2name, existingSourcePaths, targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(GenerateEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void generateEntities(final JavaSparkContext sc,
|
||||
final Map<String, String> code2name,
|
||||
final List<String> sourcePaths,
|
||||
final String targetPath) {
|
||||
|
||||
log.info("Generate entities from files:");
|
||||
sourcePaths.forEach(log::info);
|
||||
|
||||
JavaRDD<String> inputRdd = sc.emptyRDD();
|
||||
|
||||
for (final String sp : sourcePaths) {
|
||||
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
|
||||
.flatMap(list -> list.iterator())
|
||||
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
|
||||
}
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
|
||||
|
||||
}
|
||||
|
||||
private static List<Oaf> convertToListOaf(final String id, final String s, final Map<String, String> code2name) {
|
||||
final String type = StringUtils.substringAfter(id, ":");
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "native_oaf":
|
||||
return new OafToOafMapper(code2name).processMdRecord(s);
|
||||
case "native_odf":
|
||||
return new OdfToOafMapper(code2name).processMdRecord(s);
|
||||
case "datasource":
|
||||
return Arrays.asList(convertFromJson(s, Datasource.class));
|
||||
case "organization":
|
||||
return Arrays.asList(convertFromJson(s, Organization.class));
|
||||
case "project":
|
||||
return Arrays.asList(convertFromJson(s, Project.class));
|
||||
case "relation":
|
||||
return Arrays.asList(convertFromJson(s, Relation.class));
|
||||
case "publication":
|
||||
return Arrays.asList(convertFromJson(s, Publication.class));
|
||||
case "dataset":
|
||||
return Arrays.asList(convertFromJson(s, Dataset.class));
|
||||
case "software":
|
||||
return Arrays.asList(convertFromJson(s, Software.class));
|
||||
case "otherresearchproducts":
|
||||
default:
|
||||
return Arrays.asList(convertFromJson(s, OtherResearchProduct.class));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static Map<String, String> loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
||||
|
||||
log.info("Loading vocabulary terms from db...");
|
||||
|
||||
final Map<String, String> map = new HashMap<>();
|
||||
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
dbClient.processResults("select code, name from class", rs -> {
|
||||
try {
|
||||
map.put(rs.getString("code"), rs.getString("name"));
|
||||
} catch (final SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
log.info("Found " + map.size() + " terms.");
|
||||
|
||||
return map;
|
||||
|
||||
}
|
||||
|
||||
private static String convertToJson(final Oaf oaf) {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(oaf);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
|
||||
try {
|
||||
return new ObjectMapper().readValue(s, clazz);
|
||||
} catch (final Exception e) {
|
||||
log.error("Error parsing object of class: " + clazz);
|
||||
log.error(s);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
|
||||
try {
|
||||
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
|
||||
final Path path = new Path(pathToFile);
|
||||
return hdfs.exists(path);
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,242 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.migration.utils.PacePerson;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
public OafToOafMapper(final Map<String, String> code2name) {
|
||||
super(code2name);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.getText());
|
||||
author.setRank(pos++);
|
||||
final PacePerson p = new PacePerson(n.getText(), false);
|
||||
if (p.isAccurate()) {
|
||||
author.setName(p.getNormalisedFirstName());
|
||||
author.setSurname(p.getNormalisedSurname());
|
||||
}
|
||||
res.add(author);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:coverage", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:identifier")) {
|
||||
final String url = ((Node) o).getText().trim();
|
||||
if (url.startsWith("http")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(Arrays.asList(url));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:source", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// SOFTWARES
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
@Override
|
||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// OTHER PRODUCTS
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Oaf> addOtherResultRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultResult");
|
||||
r1.setSubRelType("publicationDataset");
|
||||
r1.setRelClass("isRelatedTo");
|
||||
r1.setSource(docId);
|
||||
r1.setTarget(otherId);
|
||||
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("resultResult");
|
||||
r2.setSubRelType("publicationDataset");
|
||||
r2.setRelClass("isRelatedTo");
|
||||
r2.setSource(otherId);
|
||||
r2.setTarget(docId);
|
||||
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r2);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,265 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
public OdfToOafMapper(final Map<String, String> code2name) {
|
||||
super(code2name);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//datacite:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.valueOf("./datacite:creatorName"));
|
||||
author.setName(n.valueOf("./datacite:givenName"));
|
||||
author.setSurname(n.valueOf("./datacite:familyName"));
|
||||
author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info));
|
||||
author.setPid(preparePids(doc, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//datacite:date")) {
|
||||
final String dateType = ((Node) o).valueOf("@dateType");
|
||||
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
|
||||
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//datacite:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//datacite:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
|
||||
@Override
|
||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
final List<GeoLocation> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
|
||||
final GeoLocation loc = new GeoLocation();
|
||||
loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox"));
|
||||
loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace"));
|
||||
loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint"));
|
||||
res.add(loc);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//datacite:version", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//datacite:size", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText(), false);
|
||||
final String type = ((Node) o).valueOf("@relationType");
|
||||
|
||||
if (type.equals("IsSupplementTo")) {
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
|
||||
} else if (type.equals("IsPartOf")) {
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
|
||||
} else {}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp,
|
||||
final String source,
|
||||
final String target,
|
||||
final String subRelType,
|
||||
final String relClass) {
|
||||
final Relation r = new Relation();
|
||||
r.setRelType("resultResult");
|
||||
r.setSubRelType(subRelType);
|
||||
r.setRelClass(relClass);
|
||||
r.setSource(source);
|
||||
r.setTarget(target);
|
||||
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r.setDataInfo(info);
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
package eu.dnetlib.dhp.migration.step3;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class DispatchEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
|
||||
processEntity(sc, Publication.class, sourcePath, targetPath);
|
||||
processEntity(sc, Dataset.class, sourcePath, targetPath);
|
||||
processEntity(sc, Software.class, sourcePath, targetPath);
|
||||
processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
|
||||
processEntity(sc, Datasource.class, sourcePath, targetPath);
|
||||
processEntity(sc, Organization.class, sourcePath, targetPath);
|
||||
processEntity(sc, Project.class, sourcePath, targetPath);
|
||||
processEntity(sc, Relation.class, sourcePath, targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(DispatchEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
|
||||
final String type = clazz.getSimpleName().toLowerCase();
|
||||
|
||||
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
|
||||
|
||||
sc.textFile(sourcePath)
|
||||
.filter(l -> isEntityType(l, type))
|
||||
.map(l -> StringUtils.substringAfter(l, "|"))
|
||||
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
|
||||
}
|
||||
|
||||
private static boolean isEntityType(final String line, final String type) {
|
||||
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
||||
public class AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private final AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
private final Text key = new Text();
|
||||
|
||||
private final Text value = new Text();
|
||||
|
||||
private final SequenceFile.Writer writer;
|
||||
|
||||
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
|
||||
|
||||
protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST
|
||||
this.writer = null;
|
||||
}
|
||||
|
||||
public AbstractMigrationApplication(final String hdfsPath) throws Exception {
|
||||
|
||||
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));
|
||||
|
||||
this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
||||
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
private Configuration getConf() throws IOException {
|
||||
final Configuration conf = new Configuration();
|
||||
/*
|
||||
* conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
* conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser);
|
||||
* System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf);
|
||||
*/
|
||||
return conf;
|
||||
}
|
||||
|
||||
protected void emit(final String s, final String type) {
|
||||
try {
|
||||
key.set(counter.getAndIncrement() + ":" + type);
|
||||
value.set(s);
|
||||
writer.append(key, value);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected void emitOaf(final Oaf oaf) {
|
||||
try {
|
||||
emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase());
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public ObjectMapper getObjectMapper() {
|
||||
return objectMapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.hflush();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class DbClient implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DbClient.class);
|
||||
|
||||
private Connection connection;
|
||||
|
||||
public DbClient(final String address, final String login, final String password) {
|
||||
|
||||
try {
|
||||
Class.forName("org.postgresql.Driver");
|
||||
|
||||
this.connection =
|
||||
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
|
||||
this.connection.setAutoCommit(false);
|
||||
} catch (final Exception e) {
|
||||
log.error("Connection to postgresDB failed");
|
||||
throw new RuntimeException("Connection to postgresDB failed", e);
|
||||
}
|
||||
log.info("Opened database successfully");
|
||||
}
|
||||
|
||||
public void processResults(final String sql, final Consumer<ResultSet> consumer) {
|
||||
|
||||
try (final Statement stmt = connection.createStatement()) {
|
||||
stmt.setFetchSize(100);
|
||||
|
||||
try (final ResultSet rs = stmt.executeQuery(sql)) {
|
||||
while (rs.next()) {
|
||||
consumer.accept(rs);
|
||||
}
|
||||
} catch (final SQLException e) {
|
||||
log.error("Error executing sql query: " + sql, e);
|
||||
throw new RuntimeException("Error executing sql query", e);
|
||||
}
|
||||
} catch (final SQLException e1) {
|
||||
log.error("Error preparing sql statement", e1);
|
||||
throw new RuntimeException("Error preparing sql statement", e1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
connection.close();
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.bson.Document;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
|
||||
public class MdstoreClient implements Closeable {
|
||||
|
||||
private final MongoClient client;
|
||||
private final MongoDatabase db;
|
||||
|
||||
private static final String COLL_METADATA = "metadata";
|
||||
private static final String COLL_METADATA_MANAGER = "metadataManager";
|
||||
|
||||
private static final Log log = LogFactory.getLog(MdstoreClient.class);
|
||||
|
||||
public MdstoreClient(final String baseUrl, final String dbName) {
|
||||
this.client = new MongoClient(new MongoClientURI(baseUrl));
|
||||
this.db = getDb(client, dbName);
|
||||
}
|
||||
|
||||
public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
||||
|
||||
final Map<String, String> transactions = new HashMap<>();
|
||||
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
|
||||
final String mdId = entry.getString("mdId");
|
||||
final String currentId = entry.getString("currentId");
|
||||
if (StringUtils.isNoneBlank(mdId, currentId)) {
|
||||
transactions.put(mdId, currentId);
|
||||
}
|
||||
}
|
||||
|
||||
final Map<String, String> res = new HashMap<>();
|
||||
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
|
||||
if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
|
||||
&& entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
|
||||
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private MongoDatabase getDb(final MongoClient client, final String dbName) {
|
||||
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
|
||||
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
|
||||
log.warn(err);
|
||||
throw new RuntimeException(err);
|
||||
}
|
||||
return client.getDatabase(dbName);
|
||||
}
|
||||
|
||||
private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
|
||||
if (!Iterables.contains(db.listCollectionNames(), collName)) {
|
||||
final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
|
||||
log.warn(err);
|
||||
if (abortIfMissing) {
|
||||
throw new RuntimeException(err);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return db.getCollection(collName);
|
||||
}
|
||||
|
||||
public Iterable<String> listRecords(final String collName) {
|
||||
final MongoCollection<Document> coll = getColl(db, collName, false);
|
||||
return coll == null ? new ArrayList<>()
|
||||
: () -> StreamSupport.stream(coll.find().spliterator(), false)
|
||||
.filter(e -> e.containsKey("body"))
|
||||
.map(e -> e.getString("body"))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
client.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,195 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class OafMapperUtils {
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey(k);
|
||||
kv.setValue(v);
|
||||
return kv;
|
||||
}
|
||||
|
||||
public static List<KeyValue> listKeyValues(final String... s) {
|
||||
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
|
||||
|
||||
final List<KeyValue> list = new ArrayList<>();
|
||||
for (int i = 0; i < s.length; i += 2) {
|
||||
list.add(keyValue(s[i], s[i + 1]));
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public static <T> Field<T> field(final T value, final DataInfo info) {
|
||||
if (value == null || StringUtils.isBlank(value.toString())) { return null; }
|
||||
|
||||
final Field<T> field = new Field<>();
|
||||
field.setValue(value);
|
||||
field.setDataInfo(info);
|
||||
return field;
|
||||
}
|
||||
|
||||
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
|
||||
return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
|
||||
return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(classid);
|
||||
q.setClassname(classname);
|
||||
q.setSchemeid(schemeid);
|
||||
q.setSchemename(schemename);
|
||||
return q;
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(final String value,
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
|
||||
if (value == null) { return null; }
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(value);
|
||||
sp.setQualifier(qualifier);
|
||||
sp.setDataInfo(dataInfo);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
|
||||
final ExtraInfo info = new ExtraInfo();
|
||||
info.setName(name);
|
||||
info.setValue(value);
|
||||
info.setTypology(typology);
|
||||
info.setProvenance(provenance);
|
||||
info.setTrust(trust);
|
||||
return info;
|
||||
}
|
||||
|
||||
public static OAIProvenance oaiIProvenance(final String identifier,
|
||||
final String baseURL,
|
||||
final String metadataNamespace,
|
||||
final Boolean altered,
|
||||
final String datestamp,
|
||||
final String harvestDate) {
|
||||
|
||||
final OriginDescription desc = new OriginDescription();
|
||||
desc.setIdentifier(identifier);
|
||||
desc.setBaseURL(baseURL);
|
||||
desc.setMetadataNamespace(metadataNamespace);
|
||||
desc.setAltered(altered);
|
||||
desc.setDatestamp(datestamp);
|
||||
desc.setHarvestDate(harvestDate);
|
||||
|
||||
final OAIProvenance p = new OAIProvenance();
|
||||
p.setOriginDescription(desc);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
public static Journal journal(final String name,
|
||||
final String issnPrinted,
|
||||
final String issnOnline,
|
||||
final String issnLinking,
|
||||
final String ep,
|
||||
final String iss,
|
||||
final String sp,
|
||||
final String vol,
|
||||
final String edition,
|
||||
final String conferenceplace,
|
||||
final String conferencedate,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
|
||||
final Journal j = new Journal();
|
||||
j.setName(name);
|
||||
j.setIssnPrinted(issnPrinted);
|
||||
j.setIssnOnline(issnOnline);
|
||||
j.setIssnLinking(issnLinking);
|
||||
j.setEp(ep);
|
||||
j.setIss(iss);
|
||||
j.setSp(sp);
|
||||
j.setVol(vol);
|
||||
j.setEdition(edition);
|
||||
j.setConferenceplace(conferenceplace);
|
||||
j.setConferencedate(conferencedate);
|
||||
j.setDataInfo(dataInfo);
|
||||
return j;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static DataInfo dataInfo(final Boolean deletedbyinference,
|
||||
final String inferenceprovenance,
|
||||
final Boolean inferred,
|
||||
final Boolean invisible,
|
||||
final Qualifier provenanceaction,
|
||||
final String trust) {
|
||||
final DataInfo d = new DataInfo();
|
||||
d.setDeletedbyinference(deletedbyinference);
|
||||
d.setInferenceprovenance(inferenceprovenance);
|
||||
d.setInferred(inferred);
|
||||
d.setInvisible(invisible);
|
||||
d.setProvenanceaction(provenanceaction);
|
||||
d.setTrust(trust);
|
||||
return d;
|
||||
}
|
||||
|
||||
public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) {
|
||||
if (to_md5) {
|
||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
||||
} else {
|
||||
return String.format("%s|%s", prefix, originalId);
|
||||
}
|
||||
}
|
||||
|
||||
public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) {
|
||||
switch (type) {
|
||||
case "datasource":
|
||||
return createOpenaireId(10, originalId, to_md5);
|
||||
case "organization":
|
||||
return createOpenaireId(20, originalId, to_md5);
|
||||
case "person":
|
||||
return createOpenaireId(30, originalId, to_md5);
|
||||
case "project":
|
||||
return createOpenaireId(40, originalId, to_md5);
|
||||
default:
|
||||
return createOpenaireId(50, originalId, to_md5);
|
||||
}
|
||||
}
|
||||
|
||||
public static String asString(final Object o) {
|
||||
return o == null ? "" : o.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,176 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
public class PacePerson {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
private List<String> name = Lists.newArrayList();
|
||||
private List<String> surname = Lists.newArrayList();
|
||||
private List<String> fullname = Lists.newArrayList();
|
||||
private final String original;
|
||||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
public static final String capitalize(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||
}
|
||||
|
||||
public static final String dotAbbreviations(final String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = new HashSet<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
|
||||
h.add(s);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public PacePerson(String s, final boolean aggressive) {
|
||||
original = s;
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
|
||||
s = s.replaceAll("\\d", " ");
|
||||
s = s.replaceAll("\\n", " ");
|
||||
s = s.replaceAll("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (aggressive) {
|
||||
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
|
||||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) {
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
} else if (arr.length > 1) {
|
||||
surname = splitTerms(arr[0]);
|
||||
name = splitTerms(arr[1]);
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(name);
|
||||
}
|
||||
} else {
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
name = fullname.subList(0, lastInitialPosition + 1);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (final String term : fullname) {
|
||||
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
name.add(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
||||
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (!particles.contains(part.toLowerCase())) {
|
||||
list.add(part);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<String> getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getNameString() {
|
||||
return Joiner.on(" ").join(getName());
|
||||
}
|
||||
|
||||
public List<String> getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public List<String> getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public String getOriginal() {
|
||||
return original;
|
||||
}
|
||||
|
||||
public String hash() {
|
||||
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
|
||||
}
|
||||
|
||||
public String getNormalisedFirstName() {
|
||||
return Joiner.on(" ").join(getCapitalFirstnames());
|
||||
}
|
||||
|
||||
public String getNormalisedSurname() {
|
||||
return Joiner.on(" ").join(getCapitalSurname());
|
||||
}
|
||||
|
||||
public String getSurnameString() {
|
||||
return Joiner.on(" ").join(getSurname());
|
||||
}
|
||||
|
||||
public String getNormalisedFullname() {
|
||||
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
|
||||
}
|
||||
|
||||
public List<String> getCapitalFirstnames() {
|
||||
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphRawPath",
|
||||
"paramDescription": "the path of the graph Raw in hdfs",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,39 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePaths",
|
||||
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the path of the target file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -0,0 +1,10 @@
|
|||
[
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
|
||||
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
|
||||
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
|
||||
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the path where storing the sequential file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "a",
|
||||
"paramLongName": "action",
|
||||
"paramDescription": "process claims",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the path where storing the sequential file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mongourl",
|
||||
"paramLongName": "mongoBaseUrl",
|
||||
"paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mongodb",
|
||||
"paramLongName": "mongoDb",
|
||||
"paramDescription": "mongo database",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "mdFormat",
|
||||
"paramDescription": "metadata format",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "l",
|
||||
"paramLongName": "mdLayout",
|
||||
"paramDescription": "metadata layout",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "mdInterpretation",
|
||||
"paramDescription": "metadata interpretation",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,7 @@
|
|||
van
|
||||
der
|
||||
de
|
||||
dell
|
||||
sig
|
||||
mr
|
||||
mrs
|
|
@ -0,0 +1 @@
|
|||
SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;
|
|
@ -0,0 +1,17 @@
|
|||
SELECT
|
||||
dor.datasource AS datasource,
|
||||
dor.organization AS organization,
|
||||
NULL AS startdate,
|
||||
NULL AS enddate,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.9 AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
|
||||
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
|
||||
|
||||
FROM dsm_datasource_organization dor
|
||||
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)
|
|
@ -0,0 +1,147 @@
|
|||
SELECT
|
||||
d.id AS datasourceid,
|
||||
d.id || array_agg(distinct di.pid) AS identities,
|
||||
d.officialname AS officialname,
|
||||
d.englishname AS englishname,
|
||||
d.contactemail AS contactemail,
|
||||
CASE
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
|
||||
THEN
|
||||
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
|
||||
THEN
|
||||
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
|
||||
THEN
|
||||
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
|
||||
THEN
|
||||
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
|
||||
THEN
|
||||
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
|
||||
THEN
|
||||
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
|
||||
THEN
|
||||
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
|
||||
THEN
|
||||
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
|
||||
THEN
|
||||
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
ELSE
|
||||
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
END AS openairecompatibility,
|
||||
d.websiteurl AS websiteurl,
|
||||
d.logourl AS logourl,
|
||||
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
|
||||
d.latitude AS latitude,
|
||||
d.longitude AS longitude,
|
||||
d.namespaceprefix AS namespaceprefix,
|
||||
NULL AS odnumberofitems,
|
||||
NULL AS odnumberofitemsdate,
|
||||
|
||||
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
|
||||
FROM UNNEST(
|
||||
ARRAY(
|
||||
SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
|
||||
|
||||
d.description AS description,
|
||||
NULL AS odpolicies,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.9 AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
d.dateofcollection AS dateofcollection,
|
||||
d.dateofvalidation AS dateofvalidation,
|
||||
-- re3data fields
|
||||
d.releasestartdate AS releasestartdate,
|
||||
d.releaseenddate AS releaseenddate,
|
||||
d.missionstatementurl AS missionstatementurl,
|
||||
d.dataprovider AS dataprovider,
|
||||
d.serviceprovider AS serviceprovider,
|
||||
d.databaseaccesstype AS databaseaccesstype,
|
||||
d.datauploadtype AS datauploadtype,
|
||||
d.databaseaccessrestriction AS databaseaccessrestriction,
|
||||
d.datauploadrestriction AS datauploadrestriction,
|
||||
d.versioning AS versioning,
|
||||
d.citationguidelineurl AS citationguidelineurl,
|
||||
d.qualitymanagementkind AS qualitymanagementkind,
|
||||
d.pidsystems AS pidsystems,
|
||||
d.certificates AS certificates,
|
||||
ARRAY[]::text[] AS policies,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
d.typology || '@@@' || CASE
|
||||
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
|
||||
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
|
||||
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
|
||||
WHEN (d.typology = 'infospace') THEN 'Information Space'
|
||||
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
|
||||
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
|
||||
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
|
||||
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
|
||||
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
|
||||
WHEN (d.typology = 'entityregistry') THEN 'Registry'
|
||||
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
|
||||
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
|
||||
WHEN (d.typology = 'websource') THEN 'Web Source'
|
||||
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
|
||||
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
|
||||
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
|
||||
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
|
||||
WHEN (d.typology = 'orprepository') THEN 'Repository'
|
||||
ELSE 'Other'
|
||||
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
|
||||
|
||||
FROM dsm_datasources d
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
|
||||
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
|
||||
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
|
||||
|
||||
GROUP BY
|
||||
d.id,
|
||||
d.officialname,
|
||||
d.englishname,
|
||||
d.websiteurl,
|
||||
d.logourl,
|
||||
d.contactemail,
|
||||
d.namespaceprefix,
|
||||
d.description,
|
||||
d.latitude,
|
||||
d.longitude,
|
||||
d.dateofcollection,
|
||||
d.dateofvalidation,
|
||||
d.releasestartdate,
|
||||
d.releaseenddate,
|
||||
d.missionstatementurl,
|
||||
d.dataprovider,
|
||||
d.serviceprovider,
|
||||
d.databaseaccesstype,
|
||||
d.datauploadtype,
|
||||
d.databaseaccessrestriction,
|
||||
d.datauploadrestriction,
|
||||
d.versioning,
|
||||
d.citationguidelineurl,
|
||||
d.qualitymanagementkind,
|
||||
d.pidsystems,
|
||||
d.certificates,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
d.issn,
|
||||
d.eissn,
|
||||
d.lissn
|
|
@ -0,0 +1,35 @@
|
|||
SELECT
|
||||
o.id AS organizationid,
|
||||
o.legalshortname AS legalshortname,
|
||||
o.legalname AS legalname,
|
||||
o.websiteurl AS websiteurl,
|
||||
o.logourl AS logourl,
|
||||
o.ec_legalbody AS eclegalbody,
|
||||
o.ec_legalperson AS eclegalperson,
|
||||
o.ec_nonprofit AS ecnonprofit,
|
||||
o.ec_researchorganization AS ecresearchorganization,
|
||||
o.ec_highereducation AS echighereducation,
|
||||
o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
|
||||
o.ec_internationalorganization AS ecinternationalorganization,
|
||||
o.ec_enterprise AS ecenterprise,
|
||||
o.ec_smevalidated AS ecsmevalidated,
|
||||
o.ec_nutscode AS ecnutscode,
|
||||
o.dateofcollection AS dateofcollection,
|
||||
o.lastupdate AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
o.trust AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
d.id AS collectedfromid,
|
||||
d.officialname AS collectedfromname,
|
||||
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
|
||||
ARRAY[]::text[] AS pid
|
||||
FROM dsm_organizations o
|
||||
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
SELECT
|
||||
o.id AS organizationid,
|
||||
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
|
||||
o.name AS legalname,
|
||||
array_agg(DISTINCT n.name) AS "alternativeNames",
|
||||
(array_agg(u.url))[1] AS websiteurl,
|
||||
o.modification_date AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.95 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
|
||||
FROM organizations o
|
||||
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
|
||||
LEFT OUTER JOIN other_names n ON (n.id = o.id)
|
||||
GROUP BY
|
||||
o.id,
|
||||
o.name,
|
||||
o.modification_date,
|
||||
o.country
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
|
||||
n.name AS legalshortname,
|
||||
n.name AS legalname,
|
||||
ARRAY[]::text[] AS "alternativeNames",
|
||||
(array_agg(u.url))[1] AS websiteurl,
|
||||
o.modification_date AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.88 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
|
||||
FROM other_names n
|
||||
LEFT OUTER JOIN organizations o ON (n.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
|
||||
GROUP BY
|
||||
o.id, o.modification_date, o.country, n.name
|
||||
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
SELECT
|
||||
po.project AS project,
|
||||
po.resporganization AS resporganization,
|
||||
po.participantnumber AS participantnumber,
|
||||
po.contribution AS contribution,
|
||||
NULL AS startdate,
|
||||
NULL AS enddate,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
po.trust AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
|
||||
|
||||
FROM project_organization po
|
||||
LEFT OUTER JOIN projects p ON (p.id = po.project)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
|
@ -0,0 +1,89 @@
|
|||
SELECT
|
||||
p.id AS projectid,
|
||||
p.code AS code,
|
||||
p.websiteurl AS websiteurl,
|
||||
p.acronym AS acronym,
|
||||
p.title AS title,
|
||||
p.startdate AS startdate,
|
||||
p.enddate AS enddate,
|
||||
p.call_identifier AS callidentifier,
|
||||
p.keywords AS keywords,
|
||||
p.duration AS duration,
|
||||
p.ec_sc39 AS ecsc39,
|
||||
p.oa_mandate_for_publications AS oamandatepublications,
|
||||
p.ec_article29_3 AS ecarticle29_3,
|
||||
p.dateofcollection AS dateofcollection,
|
||||
p.lastupdate AS dateoftransformation,
|
||||
p.inferred AS inferred,
|
||||
p.deletedbyinference AS deletedbyinference,
|
||||
p.trust AS trust,
|
||||
p.inferenceprovenance AS inferenceprovenance,
|
||||
p.optional1 AS optional1,
|
||||
p.optional2 AS optional2,
|
||||
p.jsonextrainfo AS jsonextrainfo,
|
||||
p.contactfullname AS contactfullname,
|
||||
p.contactfax AS contactfax,
|
||||
p.contactphone AS contactphone,
|
||||
p.contactemail AS contactemail,
|
||||
p.summary AS summary,
|
||||
p.currency AS currency,
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype,
|
||||
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
|
||||
FROM projects p
|
||||
|
||||
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
|
||||
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
|
||||
|
||||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
||||
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
|
||||
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
|
||||
|
||||
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
|
||||
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
|
||||
|
||||
GROUP BY
|
||||
p.id,
|
||||
p.code,
|
||||
p.websiteurl,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.call_identifier,
|
||||
p.keywords,
|
||||
p.duration,
|
||||
p.ec_sc39,
|
||||
p.oa_mandate_for_publications,
|
||||
p.ec_article29_3,
|
||||
p.dateofcollection,
|
||||
p.inferred,
|
||||
p.deletedbyinference,
|
||||
p.trust,
|
||||
p.inferenceprovenance,
|
||||
p.contactfullname,
|
||||
p.contactfax,
|
||||
p.contactphone,
|
||||
p.contactemail,
|
||||
p.summary,
|
||||
p.currency,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
pac.code, pac.name, pas.code, pas.name,
|
||||
p.contracttype , p.contracttypename, p.contracttypescheme;
|
|
@ -0,0 +1,90 @@
|
|||
SELECT
|
||||
p.id AS projectid,
|
||||
p.code AS code,
|
||||
p.websiteurl AS websiteurl,
|
||||
p.acronym AS acronym,
|
||||
p.title AS title,
|
||||
p.startdate AS startdate,
|
||||
p.enddate AS enddate,
|
||||
p.call_identifier AS callidentifier,
|
||||
p.keywords AS keywords,
|
||||
p.duration AS duration,
|
||||
p.ec_sc39 AS ecsc39,
|
||||
p.oa_mandate_for_publications AS oamandatepublications,
|
||||
p.ec_article29_3 AS ecarticle29_3,
|
||||
p.dateofcollection AS dateofcollection,
|
||||
p.lastupdate AS dateoftransformation,
|
||||
p.inferred AS inferred,
|
||||
p.deletedbyinference AS deletedbyinference,
|
||||
p.trust AS trust,
|
||||
p.inferenceprovenance AS inferenceprovenance,
|
||||
p.optional1 AS optional1,
|
||||
p.optional2 AS optional2,
|
||||
p.jsonextrainfo AS jsonextrainfo,
|
||||
p.contactfullname AS contactfullname,
|
||||
p.contactfax AS contactfax,
|
||||
p.contactphone AS contactphone,
|
||||
p.contactemail AS contactemail,
|
||||
p.summary AS summary,
|
||||
p.currency AS currency,
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
|
||||
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
FROM projects p
|
||||
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
|
||||
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
|
||||
|
||||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
||||
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
|
||||
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
|
||||
|
||||
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
|
||||
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
|
||||
|
||||
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
|
||||
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
|
||||
|
||||
GROUP BY
|
||||
p.id,
|
||||
p.code,
|
||||
p.websiteurl,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.call_identifier,
|
||||
p.keywords,
|
||||
p.duration,
|
||||
p.ec_sc39,
|
||||
p.oa_mandate_for_publications,
|
||||
p.ec_article29_3,
|
||||
p.dateofcollection,
|
||||
p.inferred,
|
||||
p.deletedbyinference,
|
||||
p.trust,
|
||||
p.inferenceprovenance,
|
||||
p.contactfullname,
|
||||
p.contactfax,
|
||||
p.contactphone,
|
||||
p.contactemail,
|
||||
p.summary,
|
||||
p.currency,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
pac.code, pac.name, pas.code, pas.name,
|
||||
ctc.code, ctc.name, cts.code, cts.name;
|
|
@ -0,0 +1,17 @@
|
|||
SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
o.id AS id1,
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
|
||||
FROM acronyms a
|
||||
LEFT OUTER JOIN organizations o ON (a.id = o.id)
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
o.id AS id1,
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
|
||||
FROM other_names n
|
||||
LEFT OUTER JOIN organizations o ON (n.id = o.id)
|
|
@ -0,0 +1,5 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,30 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<value>webhdfs://namenode2.hadoop.dm.openaire.eu:50071</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/applicationHistory</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,122 @@
|
|||
<workflow-app xmlns='uri:oozie:workflow:0.5' name='migrate_actions'>
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourceNN</name>
|
||||
<description>the source name node</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the isLookup service endpoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingDirectory</name>
|
||||
<value>/tmp/actionsets</value>
|
||||
<description>working directory</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_memory_mb</name>
|
||||
<value>6144</value>
|
||||
<description>memory for distcp copying actionsets from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_task_timeout</name>
|
||||
<value>60000000</value>
|
||||
<description>timeout for distcp copying actions from remote cluster</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>distcp_num_maps</name>
|
||||
<value>1</value>
|
||||
<description>mmaximum number of map tasks used in the distcp process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>transform_only</name>
|
||||
<description>activate tranform-only mode. Only apply transformation step</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to='migrate_actionsets' />
|
||||
|
||||
<action name='migrate_actionsets'>
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.actions.MigrateActionSet</main-class>
|
||||
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
|
||||
<arg>-is</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>-sn</arg><arg>${sourceNN}</arg>
|
||||
<arg>-tn</arg><arg>${nameNode}</arg>
|
||||
<arg>-w</arg><arg>${workingDirectory}</arg>
|
||||
<arg>-nm</arg><arg>${distcp_num_maps}</arg>
|
||||
<arg>-mm</arg><arg>${distcp_memory_mb}</arg>
|
||||
<arg>-tt</arg><arg>${distcp_task_timeout}</arg>
|
||||
<arg>-tr</arg><arg>${transform_only}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="transform_actions" />
|
||||
<error to="fail" />
|
||||
</action>
|
||||
|
||||
<action name="transform_actions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>transform_actions</name>
|
||||
<class>eu.dnetlib.dhp.migration.actions.TransformActions</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg><arg>yarn</arg>
|
||||
<arg>-is</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
|
||||
</spark>
|
||||
<ok to="end"/>
|
||||
<error to="fail"/>
|
||||
</action>
|
||||
|
||||
<kill name="fail">
|
||||
<message>migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<end name="end" />
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,169 @@
|
|||
<workflow-app name="import Claims as Graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationClaimsPathStep3</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep1}'/>
|
||||
<mkdir path='${migrationClaimsPathStep1}'/>
|
||||
</fs>
|
||||
<ok to="ImportDBClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDBClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/db_claims</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
<arg>-a</arg><arg>claims</arg>
|
||||
</java>
|
||||
<ok to="ImportODFClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODFClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/odf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="ImportOAFClaims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAFClaims">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationClaimsPathStep1}/oaf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="ResetClaimEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetClaimEntities">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep2}'/>
|
||||
<mkdir path='${migrationClaimsPathStep2}'/>
|
||||
</fs>
|
||||
<ok to="GenerateClaimEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateClaimEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateClaimEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationClaimsPathStep1}/db_claims,${migrationClaimsPathStep1}/oaf_claims,${migrationClaimsPathStep1}/odf_claims</arg>
|
||||
<arg>-t</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="ResetClaimGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetClaimGraph">
|
||||
<fs>
|
||||
<delete path='${migrationClaimsPathStep3}'/>
|
||||
<mkdir path='${migrationClaimsPathStep3}'/>
|
||||
</fs>
|
||||
<ok to="GenerateClaimGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateClaimGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateClaimGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationClaimsPathStep2}/claim_entities</arg>
|
||||
<arg>-g</arg><arg>${migrationClaimsPathStep3}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,197 @@
|
|||
<workflow-app name="import regular entities as Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<value>/tmp/dhp_migration</value>
|
||||
<description>the base path to store temporary intermediate data</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphBasePath</name>
|
||||
<description>the target path to store raw graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>reuseContent</name>
|
||||
<value>false</value>
|
||||
<description>should import content from the aggregator or reuse a previous version</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ReuseContent"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="ReuseContent">
|
||||
<switch>
|
||||
<case to="ResetWorkingPath">${wf:conf('reuseContent') eq false}</case>
|
||||
<case to="ResetAllEntitiesPath">${wf:conf('reuseContent') eq true}</case>
|
||||
<default to="ResetWorkingPath"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path="${workingPath}"/>
|
||||
<mkdir path="${workingPath}"/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ResetAllEntitiesPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetAllEntitiesPath">
|
||||
<fs>
|
||||
<delete path="${workingPath}/all_entities"/>
|
||||
</fs>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${workingPath}/all_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="ResetGraphPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetGraphPath">
|
||||
<fs>
|
||||
<delete path="${graphBasePath}/graph_raw"/>
|
||||
<mkdir path="${graphBasePath}/graph_raw"/>
|
||||
</fs>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${workingPath}/all_entities</arg>
|
||||
<arg>-g</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,103 @@
|
|||
<workflow-app name="import regular entities as Graph (step 1)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep1}'/>
|
||||
<mkdir path='${migrationPathStep1}'/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,62 @@
|
|||
<workflow-app name="import db entities (step 1)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep1}/db_records'/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,74 @@
|
|||
<workflow-app name="import regular entities as Graph (step 2)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>migrationPathStep1</name>
|
||||
<description>the base path to store hdfs file</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetEntities"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetEntities">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep2}'/>
|
||||
<mkdir path='${migrationPathStep2}'/>
|
||||
</fs>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep1}/db_records,${migrationPathStep1}/oaf_records,${migrationPathStep1}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,60 @@
|
|||
<workflow-app name="import regular entities as Graph (step 3)" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
<name>migrationPathStep2</name>
|
||||
<description>the temporary path to store entities before dispatching</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>migrationPathStep3</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetGraph"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetGraph">
|
||||
<fs>
|
||||
<delete path='${migrationPathStep3}'/>
|
||||
<mkdir path='${migrationPathStep3}'/>
|
||||
</fs>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${migrationPathStep2}/all_entities</arg>
|
||||
<arg>-g</arg><arg>${migrationPathStep3}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,9 @@
|
|||
# Set root logger level to DEBUG and its only appender to A1.
|
||||
log4j.rootLogger=INFO, A1
|
||||
|
||||
# A1 is set to be a ConsoleAppender.
|
||||
log4j.appender.A1=org.apache.log4j.ConsoleAppender
|
||||
|
||||
# A1 uses PatternLayout.
|
||||
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
|
@ -1,79 +1,89 @@
|
|||
package eu.dnetlib.dhp.collection;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.model.mdstore.Provenance;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class CollectionJobTest {
|
||||
private Path testDir;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
}
|
||||
private Path testDir;
|
||||
|
||||
@After
|
||||
public void teadDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
@BeforeEach
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tesCollection() throws Exception {
|
||||
Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
|
||||
GenerateNativeStoreSparkJob.main(new String[] {
|
||||
"-mt", "local",
|
||||
"-w", "wid",
|
||||
"-e", "XML",
|
||||
"-d", ""+System.currentTimeMillis(),
|
||||
"-p", new ObjectMapper().writeValueAsString(provenance),
|
||||
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
|
||||
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
|
||||
"-o", testDir.toString()+"/store",
|
||||
"-t", "true",
|
||||
"-ru", "",
|
||||
"-rp", "",
|
||||
"-rh", "",
|
||||
"-ro", "",
|
||||
"-rr", ""});
|
||||
System.out.println(new ObjectMapper().writeValueAsString(provenance));
|
||||
}
|
||||
@AfterEach
|
||||
public void teadDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tesCollection() throws Exception {
|
||||
final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix");
|
||||
GenerateNativeStoreSparkJob.main(new String[] {
|
||||
"-mt", "local",
|
||||
"-w", "wid",
|
||||
"-e", "XML",
|
||||
"-d", "" + System.currentTimeMillis(),
|
||||
"-p", new ObjectMapper().writeValueAsString(provenance),
|
||||
"-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']",
|
||||
"-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(),
|
||||
"-o", testDir.toString() + "/store",
|
||||
"-t", "true",
|
||||
"-ru", "",
|
||||
"-rp", "",
|
||||
"-rh", "",
|
||||
"-ro", "",
|
||||
"-rr", "" });
|
||||
System.out.println(new ObjectMapper().writeValueAsString(provenance));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGenerationMetadataRecord() throws Exception {
|
||||
|
||||
@Test
|
||||
public void testGenerationMetadataRecord() throws Exception {
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
final MetadataRecord record = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
|
||||
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
assert record != null;
|
||||
System.out.println(record.getId());
|
||||
System.out.println(record.getOriginalId());
|
||||
|
||||
assert record != null;
|
||||
System.out.println(record.getId());
|
||||
System.out.println(record.getOriginalId());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestEquals() throws IOException {
|
||||
|
||||
}
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
final MetadataRecord record = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
final MetadataRecord record1 = GenerateNativeStoreSparkJob
|
||||
.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar",
|
||||
"ns_prefix"), System.currentTimeMillis(), null, null);
|
||||
assert record != null;
|
||||
record.setBody("ciao");
|
||||
assert record1 != null;
|
||||
record1.setBody("mondo");
|
||||
assertEquals(record, record1);
|
||||
|
||||
|
||||
@Test
|
||||
public void TestEquals () throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml"));
|
||||
MetadataRecord record = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
MetadataRecord record1 = GenerateNativeStoreSparkJob.parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", "ns_prefix"), System.currentTimeMillis(), null,null);
|
||||
assert record != null;
|
||||
record.setBody("ciao");
|
||||
assert record1 != null;
|
||||
record1.setBody("mondo");
|
||||
Assert.assertEquals(record, record1);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -7,13 +7,13 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker;
|
|||
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
||||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.mockito.Mockito.*;
|
||||
|
||||
|
||||
|
@ -24,7 +24,7 @@ public class DnetCollectorWorkerApplicationTests {
|
|||
private MessageManager messageManager = mock(MessageManager.class);
|
||||
|
||||
private DnetCollectorWorker worker;
|
||||
@Before
|
||||
@BeforeEach
|
||||
public void setup() throws Exception {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
final String apiJson = mapper.writeValueAsString(getApi());
|
||||
|
@ -47,7 +47,7 @@ public class DnetCollectorWorkerApplicationTests {
|
|||
}
|
||||
|
||||
|
||||
@After
|
||||
@AfterEach
|
||||
public void dropDown(){
|
||||
File f = new File("/tmp/file.seq");
|
||||
f.delete();
|
||||
|
|
|
@ -0,0 +1,293 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.Array;
|
||||
import java.sql.Date;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class MigrateDbEntitiesApplicationTest {
|
||||
|
||||
private MigrateDbEntitiesApplication app;
|
||||
|
||||
@Mock
|
||||
private ResultSet rs;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
this.app = new MigrateDbEntitiesApplication();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessDatasource() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processDatasource(rs);
|
||||
assertEquals(1, list.size());
|
||||
verifyMocks(fields);
|
||||
|
||||
final Datasource ds = (Datasource) list.get(0);
|
||||
assertValidId(ds.getId());
|
||||
assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields));
|
||||
assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields));
|
||||
assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
|
||||
assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
|
||||
assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
|
||||
assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
|
||||
assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessProject() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processProject(rs);
|
||||
assertEquals(1, list.size());
|
||||
verifyMocks(fields);
|
||||
|
||||
final Project p = (Project) list.get(0);
|
||||
assertValidId(p.getId());
|
||||
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
|
||||
assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
|
||||
assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
|
||||
assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessOrganization() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processOrganization(rs);
|
||||
|
||||
assertEquals(1, list.size());
|
||||
|
||||
verifyMocks(fields);
|
||||
|
||||
final Organization o = (Organization) list.get(0);
|
||||
assertValidId(o.getId());
|
||||
assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields));
|
||||
assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
|
||||
assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
|
||||
assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]);
|
||||
assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]);
|
||||
assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]);
|
||||
assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]);
|
||||
assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
|
||||
assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessDatasourceOrganization() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processDatasourceOrganization(rs);
|
||||
|
||||
assertEquals(2, list.size());
|
||||
verifyMocks(fields);
|
||||
|
||||
final Relation r1 = (Relation) list.get(0);
|
||||
final Relation r2 = (Relation) list.get(1);
|
||||
assertValidId(r1.getSource());
|
||||
assertValidId(r2.getSource());
|
||||
assertEquals(r1.getSource(), r2.getTarget());
|
||||
assertEquals(r2.getSource(), r1.getTarget());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessProjectOrganization() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processProjectOrganization(rs);
|
||||
|
||||
assertEquals(2, list.size());
|
||||
verifyMocks(fields);
|
||||
|
||||
final Relation r1 = (Relation) list.get(0);
|
||||
final Relation r2 = (Relation) list.get(1);
|
||||
assertValidId(r1.getSource());
|
||||
assertValidId(r2.getSource());
|
||||
assertEquals(r1.getSource(), r2.getTarget());
|
||||
assertEquals(r2.getSource(), r1.getTarget());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessClaims_context() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processClaims(rs);
|
||||
|
||||
assertEquals(1, list.size());
|
||||
verifyMocks(fields);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessClaims_rels() throws Exception {
|
||||
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
|
||||
|
||||
final List<Oaf> list = app.processClaims(rs);
|
||||
|
||||
assertEquals(2, list.size());
|
||||
verifyMocks(fields);
|
||||
}
|
||||
|
||||
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
|
||||
|
||||
for (final TypedField tf : list) {
|
||||
if (tf.getValue() == null) {
|
||||
switch (tf.getType()) {
|
||||
case "not_used":
|
||||
break;
|
||||
case "boolean":
|
||||
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
|
||||
break;
|
||||
case "date":
|
||||
Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
|
||||
break;
|
||||
case "int":
|
||||
Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
|
||||
break;
|
||||
case "double":
|
||||
Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
|
||||
break;
|
||||
case "array":
|
||||
Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
|
||||
break;
|
||||
case "string":
|
||||
default:
|
||||
Mockito.when(rs.getString(tf.getField())).thenReturn(null);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (tf.getType()) {
|
||||
case "not_used":
|
||||
break;
|
||||
case "boolean":
|
||||
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
|
||||
break;
|
||||
case "date":
|
||||
Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString()));
|
||||
break;
|
||||
case "int":
|
||||
Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString()));
|
||||
break;
|
||||
case "double":
|
||||
Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString()));
|
||||
break;
|
||||
case "array":
|
||||
final Array arr = Mockito.mock(Array.class);
|
||||
final String[] values = ((List<?>) tf.getValue()).stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(o -> o.toString())
|
||||
.toArray(String[]::new);
|
||||
|
||||
Mockito.when(arr.getArray()).thenReturn(values);
|
||||
Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
|
||||
break;
|
||||
case "string":
|
||||
default:
|
||||
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
private void verifyMocks(final List<TypedField> list) throws SQLException {
|
||||
for (final TypedField tf : list) {
|
||||
|
||||
switch (tf.getType()) {
|
||||
case "not_used":
|
||||
break;
|
||||
case "boolean":
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
|
||||
break;
|
||||
case "date":
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
|
||||
break;
|
||||
case "int":
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
|
||||
break;
|
||||
case "double":
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
|
||||
break;
|
||||
case "array":
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
|
||||
break;
|
||||
case "string":
|
||||
default:
|
||||
Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void assertValidId(final String id) {
|
||||
assertEquals(49, id.length());
|
||||
assertEquals('|', id.charAt(2));
|
||||
assertEquals(':', id.charAt(15));
|
||||
assertEquals(':', id.charAt(16));
|
||||
}
|
||||
|
||||
private String getValueAsString(final String name, final List<TypedField> fields) {
|
||||
return fields.stream()
|
||||
.filter(f -> f.getField().equals(name))
|
||||
.map(TypedField::getValue)
|
||||
.filter(Objects::nonNull)
|
||||
.map(o -> o.toString())
|
||||
.findFirst()
|
||||
.get();
|
||||
}
|
||||
}
|
||||
|
||||
class TypedField {
|
||||
|
||||
private String field;
|
||||
private String type;
|
||||
private Object value;
|
||||
|
||||
public String getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
public void setField(final String field) {
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public Object getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(final Object value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
}
|
|
@ -6,47 +6,32 @@ import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
|
|||
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import net.sf.saxon.s9api.*;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.*;
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.MockitoJUnit;
|
||||
import org.mockito.junit.MockitoRule;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class TransformationJobTest {
|
||||
|
||||
@Mock
|
||||
LongAccumulator accumulator;
|
||||
|
||||
@Rule
|
||||
public MockitoRule mockitoRule = MockitoJUnit.rule();
|
||||
|
||||
private Path testDir;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory("dhp-collection");
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
}
|
||||
|
||||
private LongAccumulator accumulator;
|
||||
|
||||
@Test
|
||||
public void testTransformSaxonHE() throws Exception {
|
||||
|
@ -70,9 +55,9 @@ public class TransformationJobTest {
|
|||
System.out.println(output.toString());
|
||||
}
|
||||
|
||||
|
||||
@DisplayName("Test TransformSparkJobNode.main")
|
||||
@Test
|
||||
public void transformTest() throws Exception {
|
||||
public void transformTest(@TempDir Path testDir) throws Exception {
|
||||
final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
|
||||
final String mdstore_output = testDir.toString()+"/version";
|
||||
final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")));
|
||||
|
@ -89,8 +74,6 @@ public class TransformationJobTest {
|
|||
"-rh", "",
|
||||
"-ro", "",
|
||||
"-rr", ""});
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -121,7 +104,7 @@ public class TransformationJobTest {
|
|||
record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
|
||||
|
||||
final MetadataRecord result = tf.call(record);
|
||||
Assert.assertNotNull(result.getBody());
|
||||
assertNotNull(result.getBody());
|
||||
|
||||
System.out.println(result.getBody());
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package eu.dnetlib.dhp.transformation.vocabulary;
|
||||
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.*;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
public class VocabularyTest {
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
[
|
||||
{
|
||||
"field": "source_type",
|
||||
"type": "string",
|
||||
"value": "context"
|
||||
},
|
||||
{
|
||||
"field": "source_id",
|
||||
"type": "string",
|
||||
"value": "oa-pg"
|
||||
},
|
||||
{
|
||||
"field": "target_type",
|
||||
"type": "string",
|
||||
"value": "publication"
|
||||
},
|
||||
{
|
||||
"field": "target_id",
|
||||
"type": "string",
|
||||
"value": "userclaim___::d99de49026e79d271f3e7451d8de18b6"
|
||||
},
|
||||
{
|
||||
"field": "semantics",
|
||||
"type": "not_used",
|
||||
"value": "isRelevantTo"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,27 @@
|
|||
[
|
||||
{
|
||||
"field": "source_type",
|
||||
"type": "string",
|
||||
"value": "project"
|
||||
},
|
||||
{
|
||||
"field": "source_id",
|
||||
"type": "string",
|
||||
"value": "corda__h2020::b38a638a93b505d670fcacc47a0283d6"
|
||||
},
|
||||
{
|
||||
"field": "target_type",
|
||||
"type": "string",
|
||||
"value": "publication"
|
||||
},
|
||||
{
|
||||
"field": "target_id",
|
||||
"type": "string",
|
||||
"value": "userclaim___::5b5117253d3c64c79809d0b92fa287b4"
|
||||
},
|
||||
{
|
||||
"field": "semantics",
|
||||
"type": "not_used",
|
||||
"value": "resultProject_outcome_produces"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,62 @@
|
|||
[
|
||||
{
|
||||
"field": "datasource",
|
||||
"type": "string",
|
||||
"value": "openaire____::revistasunicauca"
|
||||
},
|
||||
{
|
||||
"field": "organization",
|
||||
"type": "string",
|
||||
"value": "openaire____::openaire____::revistasunicauca"
|
||||
},
|
||||
{
|
||||
"field": "startdate",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "enddate",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "string",
|
||||
"value": "0.9"
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "semantics",
|
||||
"type": "not_used",
|
||||
"value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies"
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
}
|
||||
]
|
|
@ -0,0 +1,234 @@
|
|||
[
|
||||
{
|
||||
"field": "datasourceid",
|
||||
"type": "string",
|
||||
"value": "274269ac6f3b::2579-5449"
|
||||
},
|
||||
{
|
||||
"field": "identities",
|
||||
"type": "not_used",
|
||||
"value": [
|
||||
"274269ac6f3b::2579-5449",
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "officialname",
|
||||
"type": "string",
|
||||
"value": "Jurnal Ilmiah Pendidikan Scholastic"
|
||||
},
|
||||
{
|
||||
"field": "englishname",
|
||||
"type": "string",
|
||||
"value": "Jurnal Ilmiah Pendidikan Scholastic"
|
||||
},
|
||||
{
|
||||
"field": "contactemail",
|
||||
"type": "string",
|
||||
"value": "test@test.it"
|
||||
},
|
||||
{
|
||||
"field": "openairecompatibility",
|
||||
"type": "string",
|
||||
"value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel"
|
||||
},
|
||||
{
|
||||
"field": "websiteurl",
|
||||
"type": "string",
|
||||
"value": "http://e-journal.sastra-unes.com/index.php/JIPS/index"
|
||||
},
|
||||
{
|
||||
"field": "logourl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "accessinfopackage",
|
||||
"type": "array",
|
||||
"value": [
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "latitude",
|
||||
"type": "double",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"field": "longitude",
|
||||
"type": "double",
|
||||
"value": 0
|
||||
},
|
||||
{
|
||||
"field": "namespaceprefix",
|
||||
"type": "string",
|
||||
"value": "ojs_25795449"
|
||||
},
|
||||
{
|
||||
"field": "odnumberofitems",
|
||||
"type": "int",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "odnumberofitemsdate",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "subjects",
|
||||
"type": "array",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "description",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "odpolicies",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "odlanguages",
|
||||
"type": "array",
|
||||
"value": []
|
||||
},
|
||||
{
|
||||
"field": "odcontenttypes",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"Journal articles"
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "string",
|
||||
"value": "0.9"
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "dateofcollection",
|
||||
"type": "date",
|
||||
"value": "2020-01-21"
|
||||
},
|
||||
{
|
||||
"field": "dateofvalidation",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "releasestartdate",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "releaseenddate",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "missionstatementurl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "dataprovider",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "serviceprovider",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "databaseaccesstype",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "datauploadtype",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "databaseaccessrestriction",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "datauploadrestriction",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "versioning",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "citationguidelineurl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "qualitymanagementkind",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "pidsystems",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "certificates",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "policies",
|
||||
"type": "not_used",
|
||||
"value": []
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": "Jurnal Fakultas Sastra Universitas Ekasakti"
|
||||
},
|
||||
{
|
||||
"field": "datasourcetype",
|
||||
"type": "string",
|
||||
"value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies"
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
|
||||
},
|
||||
{
|
||||
"field": "journal",
|
||||
"type": "string",
|
||||
"value": "2579-5449@@@2597-6540@@@"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,127 @@
|
|||
[
|
||||
{
|
||||
"field": "organizationid",
|
||||
"type": "string",
|
||||
"value": "openaire____::openaire____::microsoft"
|
||||
},
|
||||
{
|
||||
"field": "legalshortname",
|
||||
"type": "string",
|
||||
"value": "MSFTResearch"
|
||||
},
|
||||
{
|
||||
"field": "legalname",
|
||||
"type": "string",
|
||||
"value": "Microsoft Research"
|
||||
},
|
||||
{
|
||||
"field": "websiteurl",
|
||||
"type": "string",
|
||||
"value": "https://www.microsoft.com/en-us/research/"
|
||||
},
|
||||
{
|
||||
"field": "logourl",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "eclegalbody",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "eclegalperson",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecnonprofit",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecresearchorganization",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "echighereducation",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecinternationalorganizationeurinterests",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecinternationalorganization",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecenterprise",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecsmevalidated",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecnutscode",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "dateofcollection",
|
||||
"type": "date",
|
||||
"value": "2018-10-19"
|
||||
},
|
||||
{
|
||||
"field": "dateoftransformation",
|
||||
"type": "date",
|
||||
"value": "2018-10-19"
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "string",
|
||||
"value": "0.9"
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": "openaire____::TEST"
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": "TEST"
|
||||
},
|
||||
{
|
||||
"field": "country",
|
||||
"type": "string",
|
||||
"value": "US@@@US@@@dnet:countries@@@dnet:countries"
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,72 @@
|
|||
[
|
||||
{
|
||||
"field": "project",
|
||||
"type": "string",
|
||||
"value": "nsf_________::1700003"
|
||||
},
|
||||
{
|
||||
"field": "resporganization",
|
||||
"type": "string",
|
||||
"value": "nsf_________::University_of_Notre_Dame"
|
||||
},
|
||||
{
|
||||
"field": "participantnumber",
|
||||
"type": "not_used",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"field": "contribution",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "startdate",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "enddate",
|
||||
"type": "not_used",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "string",
|
||||
"value": "0.9"
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": "openaire____::nsf"
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": "NSF - National Science Foundation"
|
||||
},
|
||||
{
|
||||
"field": "semantics",
|
||||
"type": "not_used",
|
||||
"value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations"
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
|
||||
}
|
||||
]
|
|
@ -0,0 +1,193 @@
|
|||
[
|
||||
{
|
||||
"field": "projectid",
|
||||
"type": "string",
|
||||
"value": "aka_________::100469"
|
||||
},
|
||||
{
|
||||
"field": "code",
|
||||
"type": "string",
|
||||
"value": "100469"
|
||||
},
|
||||
{
|
||||
"field": "websiteurl",
|
||||
"type": "string",
|
||||
"value": "http://test"
|
||||
},
|
||||
{
|
||||
"field": "acronym",
|
||||
"type": "string",
|
||||
"value": "RMCAG"
|
||||
},
|
||||
{
|
||||
"field": "title",
|
||||
"type": "string",
|
||||
"value": "Regulation of melanoma cell autonomous growth"
|
||||
},
|
||||
{
|
||||
"field": "startdate",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "enddate",
|
||||
"type": "date",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "callidentifier",
|
||||
"type": "string",
|
||||
"value": "Tutkijankoulutus ja työskentely ulkomailla/kevät TT"
|
||||
},
|
||||
{
|
||||
"field": "keywords",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "duration",
|
||||
"type": "int",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "ecsc39",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "oamandatepublications",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "ecarticle29_3",
|
||||
"type": "boolean",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "dateofcollection",
|
||||
"type": "date",
|
||||
"value": "2019-01-25"
|
||||
},
|
||||
{
|
||||
"field": "dateoftransformation",
|
||||
"type": "date",
|
||||
"value": "2019-04-16"
|
||||
},
|
||||
{
|
||||
"field": "inferred",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "deletedbyinference",
|
||||
"type": "boolean",
|
||||
"value": false
|
||||
},
|
||||
{
|
||||
"field": "trust",
|
||||
"type": "string",
|
||||
"value": "0.9"
|
||||
},
|
||||
{
|
||||
"field": "inferenceprovenance",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "optional1",
|
||||
"type": "string",
|
||||
"value": "9,284 €"
|
||||
},
|
||||
{
|
||||
"field": "optional2",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "jsonextrainfo",
|
||||
"type": "string",
|
||||
"value": "{}"
|
||||
},
|
||||
{
|
||||
"field": "contactfullname",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "contactfax",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "contactphone",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "contactemail",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "summary",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "currency",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "totalcost",
|
||||
"type": "double",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "fundedamount",
|
||||
"type": "double",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "collectedfromid",
|
||||
"type": "string",
|
||||
"value": "openaire____::aka"
|
||||
},
|
||||
{
|
||||
"field": "collectedfromname",
|
||||
"type": "string",
|
||||
"value": "Academy of Finland"
|
||||
},
|
||||
{
|
||||
"field": "contracttype",
|
||||
"type": "string",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"field": "provenanceaction",
|
||||
"type": "not_used",
|
||||
"value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions"
|
||||
},
|
||||
{
|
||||
"field": "pid",
|
||||
"type": "not_used",
|
||||
"value": [
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "subjects",
|
||||
"type": "array",
|
||||
"value": [
|
||||
null
|
||||
]
|
||||
},
|
||||
{
|
||||
"field": "fundingtree",
|
||||
"type": "array",
|
||||
"value": [
|
||||
"<fundingtree><funder>\n <id>aka_________::AKA</id>\n <shortname>AKA</shortname>\n <name>Academy of Finland</name>\n <originalname>Academy of Finland</originalname>\n <jurisdiction>FI</jurisdiction>\n </funder></fundingtree>"
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,97 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-dedup-openaire</artifactId>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-graphx_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dedup;
|
||||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.dedup;
|
||||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
@ -11,7 +10,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -69,8 +68,6 @@ public class DedupRecordFactory {
|
|||
p.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
||||
|
@ -103,7 +100,6 @@ public class DedupRecordFactory {
|
|||
d.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
||||
|
@ -136,7 +132,6 @@ public class DedupRecordFactory {
|
|||
p.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
if (e._2() != null)
|
||||
e._2().forEach(proj -> {
|
||||
try {
|
||||
|
@ -160,7 +155,6 @@ public class DedupRecordFactory {
|
|||
|
||||
s.setId(e._1());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
if (e._2() != null)
|
||||
e._2().forEach(soft -> {
|
||||
|
@ -188,7 +182,6 @@ public class DedupRecordFactory {
|
|||
Datasource d = new Datasource(); //the result of the merge, to be returned at the end
|
||||
d.setId(e._1());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
if (e._2() != null)
|
||||
e._2().forEach(dat -> {
|
||||
try {
|
||||
|
@ -213,7 +206,6 @@ public class DedupRecordFactory {
|
|||
o.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
|
||||
StringBuilder trust = new StringBuilder("0.0");
|
||||
|
@ -254,7 +246,6 @@ public class DedupRecordFactory {
|
|||
o.setId(e._1());
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||
|
|
@ -1,35 +1,32 @@
|
|||
package eu.dnetlib.dedup;
|
||||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.io.StringReader;
|
||||
import java.security.MessageDigest;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class DedupUtility {
|
||||
private static final Double THRESHOLD = 0.95;
|
||||
|
@ -54,38 +51,6 @@ public class DedupUtility {
|
|||
return accumulators;
|
||||
}
|
||||
|
||||
public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
|
||||
return context.textFile(path);
|
||||
}
|
||||
|
||||
public static void deleteIfExists(String path) throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
if (fileSystem.exists(new Path(path))) {
|
||||
fileSystem.delete(new Path(path), true);
|
||||
}
|
||||
}
|
||||
|
||||
public static DedupConfig loadConfigFromHDFS(String path) throws IOException {
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));
|
||||
|
||||
return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));
|
||||
|
||||
}
|
||||
|
||||
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
|
||||
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||
}
|
||||
|
@ -146,16 +111,20 @@ public class DedupUtility {
|
|||
});
|
||||
}
|
||||
|
||||
public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) {
|
||||
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
|
||||
}
|
||||
|
||||
public static String createEntityPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s", basePath, entityType);
|
||||
}
|
||||
|
||||
public static String createSimRelPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s/simRel", basePath, entityType);
|
||||
public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) {
|
||||
return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType);
|
||||
}
|
||||
|
||||
public static String createMergeRelPath(final String basePath, final String entityType) {
|
||||
return String.format("%s/%s/mergeRel", basePath, entityType);
|
||||
public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) {
|
||||
return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
|
||||
}
|
||||
|
||||
private static Double sim(Author a, Author b) {
|
||||
|
@ -216,4 +185,37 @@ public class DedupUtility {
|
|||
return false;
|
||||
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||
}
|
||||
|
||||
public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException {
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
|
||||
|
||||
final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator);
|
||||
|
||||
String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery);
|
||||
|
||||
final Document doc = new SAXReader().read(new StringReader(orchestratorProfile));
|
||||
|
||||
final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id");
|
||||
final List<DedupConfig> configurations = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) {
|
||||
configurations.add(loadConfig(isLookUpService, actionSetId, o));
|
||||
}
|
||||
|
||||
return configurations;
|
||||
|
||||
}
|
||||
|
||||
private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o)
|
||||
throws ISLookUpException {
|
||||
final Element s = (Element) o;
|
||||
final String configProfileId = s.attributeValue("id");
|
||||
final String conf =
|
||||
isLookUpService.getResourceProfileByQuery(String.format(
|
||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
||||
configProfileId));
|
||||
final DedupConfig dedupConfig = DedupConfig.load(conf);
|
||||
dedupConfig.getWf().setConfigurationId(actionSetId);
|
||||
return dedupConfig;
|
||||
}
|
||||
}
|
|
@ -1,7 +1,6 @@
|
|||
package eu.dnetlib.dedup;
|
||||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.BlockProcessor;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dedup;
|
||||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
public enum OafEntityType {
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
package eu.dnetlib.dhp.dedup;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import eu.dnetlib.dhp.dedup.graph.ConnectedComponent;
|
||||
import eu.dnetlib.dhp.dedup.graph.GraphProcessor;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.graphx.Edge;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.DocumentException;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class SparkCreateConnectedComponent {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
new SparkCreateConnectedComponent().run(parser);
|
||||
}
|
||||
|
||||
private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
|
||||
|
||||
final String graphBasePath = parser.get("graphBasePath");
|
||||
final String workingPath = parser.get("workingPath");
|
||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||
final String actionSetId = parser.get("actionSetId");
|
||||
|
||||
try (SparkSession spark = getSparkSession(parser)) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
|
||||
|
||||
final String entity = dedupConf.getWf().getEntityType();
|
||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||
|
||||
final JavaPairRDD<Object, String> vertexes = sc.textFile(graphBasePath + "/" + subEntity)
|
||||
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||
.mapToPair((PairFunction<String, Object, String>)
|
||||
s -> new Tuple2<Object, String>(getHashcode(s), s)
|
||||
);
|
||||
|
||||
final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class));
|
||||
final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
|
||||
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
|
||||
final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
|
||||
c.getDocIds()
|
||||
.stream()
|
||||
.flatMap(id -> {
|
||||
List<Relation> tmp = new ArrayList<>();
|
||||
Relation r = new Relation();
|
||||
r.setSource(c.getCcId());
|
||||
r.setTarget(id);
|
||||
r.setRelClass("merges");
|
||||
tmp.add(r);
|
||||
r = new Relation();
|
||||
r.setTarget(c.getCcId());
|
||||
r.setSource(id);
|
||||
r.setRelClass("isMergedIn");
|
||||
tmp.add(r);
|
||||
return tmp.stream();
|
||||
}).iterator()).rdd(), Encoders.bean(Relation.class));
|
||||
mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static long getHashcode(final String id) {
|
||||
return Hashing.murmur3_128().hashString(id).asLong();
|
||||
}
|
||||
|
||||
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(SparkCreateSimRels.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config(conf)
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue