Merge branch 'master' into przemyslawjacewicz_actionmanager_impl_prototype

This commit is contained in:
przemek 2020-03-31 12:04:58 +02:00
commit 9d1d18d4b9
215 changed files with 10396 additions and 1160 deletions

5
.gitignore vendored
View File

@ -1,9 +1,12 @@
.DS_Store .DS_Store
.idea .idea
*.iws
*.ipr
*.iml *.iml
*.ipr *.ipr
*.iws *.iws
*~ *~
.vscode
.classpath .classpath
/*/.classpath /*/.classpath
/*/*/.classpath /*/*/.classpath
@ -11,7 +14,6 @@
/*/.metadata /*/.metadata
/*/*/.metadata /*/*/.metadata
.project .project
.log
.settings .settings
/*/*/target /*/*/target
/*/target /*/target
@ -21,4 +23,5 @@
/build /build
spark-warehouse spark-warehouse
/**/job-override.properties /**/job-override.properties
/**/*.log

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-assembly-resources</artifactId> <artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId> <artifactId>dhp-build-properties-maven-plugin</artifactId>
@ -102,7 +102,7 @@
</goals> </goals>
</pluginExecutionFilter> </pluginExecutionFilter>
<action> <action>
<ignore></ignore> <ignore />
</action> </action>
</pluginExecution> </pluginExecution>
</pluginExecutions> </pluginExecutions>

View File

@ -1,22 +1,21 @@
package eu.dnetlib.maven.plugin.properties; package eu.dnetlib.maven.plugin.properties;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME;
import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR;
import static org.junit.Assert.assertEquals; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.Assert.assertNull;
import org.junit.Before;
import org.junit.Test;
/** /**
* @author mhorst * @author mhorst, claudio.atzori
* *
*/ */
public class GenerateOoziePropertiesMojoTest { public class GenerateOoziePropertiesMojoTest {
private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@Before @BeforeEach
public void clearSystemProperties() { public void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
@ -28,7 +27,7 @@ public class GenerateOoziePropertiesMojoTest {
mojo.execute(); mojo.execute();
// assert // assert
assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
} }
@Test @Test

View File

@ -1,51 +1,41 @@
package eu.dnetlib.maven.plugin.properties; package eu.dnetlib.maven.plugin.properties;
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.doReturn;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Properties;
import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.project.MavenProject; import org.apache.maven.project.MavenProject;
import org.junit.Before; import org.junit.jupiter.api.*;
import org.junit.Rule; import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.Test; import org.junit.jupiter.api.io.TempDir;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.runners.MockitoJUnitRunner; import org.mockito.MockitoAnnotations;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.*;
import java.util.Properties;
import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.lenient;
/** /**
* @author mhorst * @author mhorst, claudio.atzori
* *
*/ */
@RunWith(MockitoJUnitRunner.class) @ExtendWith(MockitoExtension.class)
public class WritePredefinedProjectPropertiesTest { public class WritePredefinedProjectPropertiesTest {
@Rule
public TemporaryFolder testFolder = new TemporaryFolder();
@Mock @Mock
private MavenProject mavenProject; private MavenProject mavenProject;
private WritePredefinedProjectProperties mojo; private WritePredefinedProjectProperties mojo;
@Before @BeforeEach
public void init() { public void init(@TempDir File testFolder) {
MockitoAnnotations.initMocks(this);
mojo = new WritePredefinedProjectProperties(); mojo = new WritePredefinedProjectProperties();
mojo.outputFile = getPropertiesFileLocation(); mojo.outputFile = getPropertiesFileLocation(testFolder);
mojo.project = mavenProject; mojo.project = mavenProject;
doReturn(new Properties()).when(mavenProject).getProperties(); lenient().doReturn(new Properties()).when(mavenProject).getProperties();
} }
// ----------------------------------- TESTS --------------------------------------------- // ----------------------------------- TESTS ---------------------------------------------
@ -57,7 +47,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(0, storedProperties.size()); assertEquals(0, storedProperties.size());
} }
@ -75,28 +65,28 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile());
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key)); assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key)); assertEquals(value, storedProperties.getProperty(key));
} }
@Test(expected=MojoExecutionException.class) @Test()
public void testExecuteWithProjectPropertiesAndInvalidOutputFile() throws Exception { public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
Properties projectProperties = new Properties(); Properties projectProperties = new Properties();
projectProperties.setProperty(key, value); projectProperties.setProperty(key, value);
doReturn(projectProperties).when(mavenProject).getProperties(); doReturn(projectProperties).when(mavenProject).getProperties();
mojo.outputFile = testFolder.getRoot(); mojo.outputFile = testFolder;
// execute // execute
mojo.execute(); Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
} }
@Test @Test
public void testExecuteWithProjectPropertiesExclusion() throws Exception { public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -113,14 +103,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(key)); assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key)); assertEquals(value, storedProperties.getProperty(key));
} }
@Test @Test
public void testExecuteWithProjectPropertiesInclusion() throws Exception { public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -137,14 +127,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey)); assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey));
} }
@Test @Test
public void testExecuteIncludingPropertyKeysFromFile() throws Exception { public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -155,7 +145,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue); projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties(); doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.properties"); File includedPropertiesFile = new File(testFolder, "included.properties");
Properties includedProperties = new Properties(); Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileWriter(includedPropertiesFile), null); includedProperties.store(new FileWriter(includedPropertiesFile), null);
@ -167,14 +157,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey)); assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey));
} }
@Test @Test
public void testExecuteIncludingPropertyKeysFromClasspathResource() throws Exception { public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -192,14 +182,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey)); assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey));
} }
@Test(expected=MojoExecutionException.class) @Test
public void testExecuteIncludingPropertyKeysFromBlankLocation() throws Exception { public void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -213,11 +203,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {""}); mojo.setIncludePropertyKeysFromFiles(new String[] {""});
// execute // execute
mojo.execute(); Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
} }
@Test @Test
public void testExecuteIncludingPropertyKeysFromXmlFile() throws Exception { public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -228,7 +218,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue); projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties(); doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml"); File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties(); Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null);
@ -240,14 +230,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(1, storedProperties.size()); assertEquals(1, storedProperties.size());
assertTrue(storedProperties.containsKey(includedKey)); assertTrue(storedProperties.containsKey(includedKey));
assertEquals(includedValue, storedProperties.getProperty(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey));
} }
@Test(expected=MojoExecutionException.class) @Test
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile() throws Exception { public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -258,7 +248,7 @@ public class WritePredefinedProjectPropertiesTest {
projectProperties.setProperty(includedKey, includedValue); projectProperties.setProperty(includedKey, includedValue);
doReturn(projectProperties).when(mavenProject).getProperties(); doReturn(projectProperties).when(mavenProject).getProperties();
File includedPropertiesFile = new File(testFolder.getRoot(), "included.xml"); File includedPropertiesFile = new File(testFolder, "included.xml");
Properties includedProperties = new Properties(); Properties includedProperties = new Properties();
includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.setProperty(includedKey, "irrelevantValue");
includedProperties.store(new FileOutputStream(includedPropertiesFile), null); includedProperties.store(new FileOutputStream(includedPropertiesFile), null);
@ -266,11 +256,11 @@ public class WritePredefinedProjectPropertiesTest {
mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()});
// execute // execute
mojo.execute(); Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
} }
@Test @Test
public void testExecuteWithQuietModeOn() throws Exception { public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given // given
mojo.setQuiet(true); mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
@ -280,21 +270,21 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertEquals(0, storedProperties.size()); assertEquals(0, storedProperties.size());
} }
@Test(expected=MojoExecutionException.class) @Test
public void testExecuteIncludingPropertyKeysFromInvalidFile() throws Exception { public void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given // given
mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"});
// execute // execute
mojo.execute(); Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
} }
@Test @Test
public void testExecuteWithEnvironmentProperties() throws Exception { public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given // given
mojo.setIncludeEnvironmentVariables(true); mojo.setIncludeEnvironmentVariables(true);
@ -303,7 +293,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0); assertTrue(storedProperties.size() > 0);
for (Object currentKey : storedProperties.keySet()) { for (Object currentKey : storedProperties.keySet()) {
assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV)); assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV));
@ -311,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
public void testExecuteWithSystemProperties() throws Exception { public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given // given
String key = "systemPropertyKey"; String key = "systemPropertyKey";
String value = "systemPropertyValue"; String value = "systemPropertyValue";
@ -323,14 +313,14 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0); assertTrue(storedProperties.size() > 0);
assertTrue(storedProperties.containsKey(key)); assertTrue(storedProperties.containsKey(key));
assertEquals(value, storedProperties.getProperty(key)); assertEquals(value, storedProperties.getProperty(key));
} }
@Test @Test
public void testExecuteWithSystemPropertiesAndEscapeChars() throws Exception { public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception {
// given // given
String key = "systemPropertyKey "; String key = "systemPropertyKey ";
String value = "systemPropertyValue"; String value = "systemPropertyValue";
@ -344,7 +334,7 @@ public class WritePredefinedProjectPropertiesTest {
// assert // assert
assertTrue(mojo.outputFile.exists()); assertTrue(mojo.outputFile.exists());
Properties storedProperties = getStoredProperties(); Properties storedProperties = getStoredProperties(testFolder);
assertTrue(storedProperties.size() > 0); assertTrue(storedProperties.size() > 0);
assertFalse(storedProperties.containsKey(key)); assertFalse(storedProperties.containsKey(key));
assertTrue(storedProperties.containsKey(key.trim())); assertTrue(storedProperties.containsKey(key.trim()));
@ -353,13 +343,13 @@ public class WritePredefinedProjectPropertiesTest {
// ----------------------------------- PRIVATE ------------------------------------------- // ----------------------------------- PRIVATE -------------------------------------------
private File getPropertiesFileLocation() { private File getPropertiesFileLocation(File testFolder) {
return new File(testFolder.getRoot(), "test.properties"); return new File(testFolder, "test.properties");
} }
private Properties getStoredProperties() throws FileNotFoundException, IOException { private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException {
Properties properties = new Properties(); Properties properties = new Properties();
properties.load(new FileInputStream(getPropertiesFileLocation())); properties.load(new FileInputStream(getPropertiesFileLocation(testFolder)));
return properties; return properties;
} }
} }

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<packaging>pom</packaging> <packaging>pom</packaging>

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>
@ -58,6 +58,15 @@
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId> <artifactId>cnr-rmi-api</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.parser.utility;
public class VtdException extends Exception {
public VtdException(final Exception e) {
super(e);
}
public VtdException(final Throwable e) {
super(e);
}
}

View File

@ -0,0 +1,107 @@
package eu.dnetlib.dhp.parser.utility;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDNav;
/**
* Created by sandro on 9/29/16.
*/
public class VtdUtilityParser {
public static List<Node> getTextValuesWithAttributes(final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException {
final List<Node> results = new ArrayList<>();
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
final Node currentNode = new Node();
int t = vn.getText();
if (t >= 0) {
currentNode.setTextValue(vn.toNormalizedString(t));
}
currentNode.setAttributes(getAttributes(vn, attributes));
results.add(currentNode);
}
return results;
} catch (Exception e) {
throw new VtdException(e);
}
}
private static Map<String, String> getAttributes(final VTDNav vn, final List<String> attributes) {
final Map<String, String> currentAttributes = new HashMap<>();
if (attributes != null) {
attributes.forEach(attributeKey -> {
try {
int attr = vn.getAttrVal(attributeKey);
if (attr > -1) {
currentAttributes.put(attributeKey, vn.toNormalizedString(attr));
}
} catch (Throwable e) {
throw new RuntimeException(e);
}
});
}
return currentAttributes;
}
public static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws VtdException {
List<String> results = new ArrayList<>();
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t > -1) results.add(vn.toNormalizedString(t));
}
return results;
} catch (Exception e) {
throw new VtdException(e);
}
}
public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) throws VtdException {
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
int it = nav.getText();
if (it > -1)
return nav.toNormalizedString(it);
}
return null;
} catch (Exception e) {
throw new VtdException(e);
}
}
public static class Node {
private String textValue;
private Map<String, String> attributes;
public String getTextValue() {
return textValue;
}
public void setTextValue(final String textValue) {
this.textValue = textValue;
}
public Map<String, String> getAttributes() {
return attributes;
}
public void setAttributes(final Map<String, String> attributes) {
this.attributes = attributes;
}
}
}

View File

@ -1,5 +1,7 @@
package eu.dnetlib.dhp.utils; package eu.dnetlib.dhp.utils;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
@ -56,4 +58,17 @@ public class DHPUtils {
} }
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return o.toString();
} catch (Exception e) {
return "";
}
}
} }

View File

@ -0,0 +1,24 @@
package eu.dnetlib.scholexplorer.relation;
import java.io.Serializable;
public class RelInfo implements Serializable {
private String original;
private String inverse;
public String getOriginal() {
return original;
}
public void setOriginal(String original) {
this.original = original;
}
public String getInverse() {
return inverse;
}
public void setInverse(String inverse) {
this.inverse = inverse;
}
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.scholexplorer.relation;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import java.io.Serializable;
import java.util.HashMap;
public class RelationMapper extends HashMap<String,RelInfo > implements Serializable {
public static RelationMapper load() throws Exception {
final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json"));
ObjectMapper mapper = new ObjectMapper();
return mapper.readValue(json, RelationMapper.class);
}
}

View File

@ -0,0 +1,158 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -1,18 +1,13 @@
package eu.dnetlib.dhp.application; package eu.dnetlib.dhp.application;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.Test; import org.junit.jupiter.api.Test;
import java.io.ByteArrayOutputStream; import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.Base64; import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.util.zip.GZIPOutputStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
public class ArgumentApplicationParserTest { public class ArgumentApplicationParserTest {
@Test @Test
public void testParseParameter() throws Exception { public void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.model.mdstore; package eu.dnetlib.dhp.model.mdstore;
import org.junit.Test; import org.junit.jupiter.api.Test;
import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
public class MetadataRecordTest { public class MetadataRecordTest {
@ -10,6 +10,6 @@ public class MetadataRecordTest {
public void getTimestamp() { public void getTimestamp() {
MetadataRecord r = new MetadataRecord(); MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() >0); assertTrue(r.getDateOfCollection() > 0);
} }
} }

View File

@ -1,12 +1,12 @@
package eu.dnetlib.message; package eu.dnetlib.message;
import org.junit.Test; import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import static org.junit.Assert.*; import static org.junit.jupiter.api.Assertions.*;
public class MessageTest { public class MessageTest {

View File

@ -0,0 +1,15 @@
package eu.dnetlib.scholexplorer.relation;
import org.junit.jupiter.api.Test;
public class RelationMapperTest {
@Test
public void testLoadRels() throws Exception{
RelationMapper relationMapper = RelationMapper.load();
relationMapper.keySet().forEach(System.out::println);
}
}

View File

@ -0,0 +1,158 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>
@ -36,19 +36,6 @@
<artifactId>guava</artifactId> <artifactId>guava</artifactId>
</dependency> </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies> </dependencies>

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DLIDataset extends Dataset {
private String originalObjIdentifier;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus;
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getOriginalObjIdentifier() {
return originalObjIdentifier;
}
public void setOriginalObjIdentifier(String originalObjIdentifier) {
this.originalObjIdentifier = originalObjIdentifier;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
DLIDataset p = (DLIDataset) e;
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
completionStatus = p.completionStatus;
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.*;
public class DLIPublication extends Publication implements Serializable {
private String originalObjIdentifier;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus;
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getOriginalObjIdentifier() {
return originalObjIdentifier;
}
public void setOriginalObjIdentifier(String originalObjIdentifier) {
this.originalObjIdentifier = originalObjIdentifier;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
DLIPublication p = (DLIPublication) e;
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
completionStatus = p.completionStatus;
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,108 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DLIUnknown extends Oaf implements Serializable {
private String id;
private List<StructuredProperty> pid;
private String dateofcollection;
private String dateoftransformation;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus = "incomplete";
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<StructuredProperty> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
this.pid = pid;
}
public String getDateofcollection() {
return dateofcollection;
}
public void setDateofcollection(String dateofcollection) {
this.dateofcollection = dateofcollection;
}
public String getDateoftransformation() {
return dateoftransformation;
}
public void setDateoftransformation(String dateoftransformation) {
this.dateoftransformation = dateoftransformation;
}
public void mergeFrom(DLIUnknown p) {
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import java.io.Serializable;
public class ProvenaceInfo implements Serializable {
private String id;
private String name;
private String completionStatus;
private String collectionMode ="collected";
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public String getCollectionMode() {
return collectionMode;
}
public void setCollectionMode(String collectionMode) {
this.collectionMode = collectionMode;
}
}

View File

@ -3,11 +3,16 @@ package eu.dnetlib.dhp.schema.action;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.junit.Assert;
import org.junit.Test; import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import static org.junit.jupiter.api.Assertions.*;
/**
* @author claudio.atzori
*/
public class AtomicActionTest { public class AtomicActionTest {
@Test @Test
@ -25,12 +30,12 @@ public class AtomicActionTest {
final ObjectMapper mapper = new ObjectMapper(); final ObjectMapper mapper = new ObjectMapper();
String json = mapper.writeValueAsString(aa1); String json = mapper.writeValueAsString(aa1);
Assert.assertTrue(StringUtils.isNotBlank(json)); assertTrue(StringUtils.isNotBlank(json));
AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); AtomicAction aa2 = mapper.readValue(json, AtomicAction.class);
Assert.assertEquals(aa1.getClazz(), aa2.getClazz()); assertEquals(aa1.getClazz(), aa2.getClazz());
Assert.assertEquals(aa1.getPayload(), aa2.getPayload()); assertEquals(aa1.getPayload(), aa2.getPayload());
} }

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import org.junit.Assert; import static org.junit.jupiter.api.Assertions.*;
import org.junit.Before; import org.junit.jupiter.api.BeforeEach;
import org.junit.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@ -13,7 +11,7 @@ public class MergeTest {
OafEntity oaf; OafEntity oaf;
@Before @BeforeEach
public void setUp() { public void setUp() {
oaf = new Publication(); oaf = new Publication();
} }
@ -44,8 +42,8 @@ public class MergeTest {
a.mergeFrom(b); a.mergeFrom(b);
Assert.assertNotNull(a.getCollectedfrom()); assertNotNull(a.getCollectedfrom());
Assert.assertEquals(3, a.getCollectedfrom().size()); assertEquals(3, a.getCollectedfrom().size());
} }
@ -60,8 +58,8 @@ public class MergeTest {
a.mergeFrom(b); a.mergeFrom(b);
Assert.assertNotNull(a.getSubject()); assertNotNull(a.getSubject());
Assert.assertEquals(3, a.getSubject().size()); assertEquals(3, a.getSubject().size());
} }

View File

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
public class DLItest {
@Test
public void testMergePublication() throws JsonProcessingException {
DLIPublication a1 = new DLIPublication();
a1.setPid(Arrays.asList( createSP("123456","pdb","dnet:pid_types")));
a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle")));
a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd","Zenodo","complete")));
a1.setCompletionStatus("complete");
DLIPublication a = new DLIPublication();
a.setPid(Arrays.asList(createSP("10.11","doi","dnet:pid_types"), createSP("123456","pdb","dnet:pid_types")));
a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle")));
a.setDlicollectedfrom(Arrays.asList(createCollectedFrom("dct","datacite","complete"),createCollectedFrom("dct","datacite","incomplete")));
a.setCompletionStatus("incomplete");
a.mergeFrom(a1);
ObjectMapper mapper = new ObjectMapper();
System.out.println(mapper.writeValueAsString(a));
}
@Test
public void testDeserialization() throws IOException {
final String json ="{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}";
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class);
mapper.enable(SerializationFeature.INDENT_OUTPUT);
System.out.println(mapper.writeValueAsString(dliDataset));
}
private ProvenaceInfo createCollectedFrom(final String id, final String name, final String completionStatus) {
ProvenaceInfo p = new ProvenaceInfo();
p.setId(id);
p.setName(name);
p.setCompletionStatus(completionStatus);
return p;
}
private StructuredProperty createSP(final String value, final String className, final String schemeName) {
StructuredProperty p = new StructuredProperty();
p.setValue(value);
Qualifier schema = new Qualifier();
schema.setClassname(className);
schema.setClassid(className);
schema.setSchemename(schemeName);
schema.setSchemeid(schemeName);
p.setQualifier(schema);
return p;
}
}

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-aggregation</artifactId> <artifactId>dhp-aggregation</artifactId>
@ -105,6 +105,7 @@
<artifactId>mongo-java-driver</artifactId> <artifactId>mongo-java-driver</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.hadoop</groupId> <groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
@ -116,13 +117,6 @@
<version>42.2.10</version> <version>42.2.10</version>
</dependency> </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>2.25.0</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -20,6 +20,7 @@ import java.util.Arrays;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -36,6 +37,7 @@ import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal; import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
@ -95,6 +97,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
} }
} }
protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST
super();
this.dbClient = null;
this.lastUpdateTimestamp = new Date().getTime();
}
public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser, public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception { final String dbPassword) throws Exception {
super(hdfsPath); super(hdfsPath);
@ -102,12 +110,15 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
this.lastUpdateTimestamp = new Date().getTime(); this.lastUpdateTimestamp = new Date().getTime();
} }
public void execute(final String sqlFile, final Consumer<ResultSet> consumer) throws Exception { public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) throws Exception {
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile)); final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf));
dbClient.processResults(sql, consumer); dbClient.processResults(sql, consumer);
} }
public void processDatasource(final ResultSet rs) { public List<Oaf> processDatasource(final ResultSet rs) {
try { try {
@ -161,61 +172,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setDataInfo(info); ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp); ds.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("datasourceid"); return Arrays.asList(ds);
// rs.getArray("identities");
// rs.getString("officialname");
// rs.getString("englishname");
// rs.getString("contactemail");
// rs.getString("openairecompatibility"); // COMPLEX ...@@@...
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getArray("accessinfopackage");
// rs.getDouble("latitude");
// rs.getDouble("longitude");
// rs.getString("namespaceprefix");
// rs.getInt("odnumberofitems"); // NULL
// rs.getDate("odnumberofitemsdate"); // NULL
// rs.getArray("subjects");
// rs.getString("description");
// rs.getString("odpolicies"); // NULL
// rs.getArray("odlanguages");
// rs.getArray("odcontenttypes");
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference");// false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getDate("dateofcollection");
// rs.getDate("dateofvalidation");
// rs.getDate("releasestartdate");
// rs.getDate("releaseenddate");
// rs.getString("missionstatementurl");
// rs.getBoolean("dataprovider");
// rs.getBoolean("serviceprovider");
// rs.getString("databaseaccesstype");
// rs.getString("datauploadtype");
// rs.getString("databaseaccessrestriction");
// rs.getString("datauploadrestriction");
// rs.getBoolean("versioning");
// rs.getString("citationguidelineurl");
// rs.getString("qualitymanagementkind");
// rs.getString("pidsystems");
// rs.getString("certificates");
// rs.getArray("policies");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("datasourcetype"); // COMPLEX
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction,
// rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
emitOaf(ds);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
public void processProject(final ResultSet rs) { public List<Oaf> processProject(final ResultSet rs) {
try { try {
final DataInfo info = prepareDataInfo(rs); final DataInfo info = prepareDataInfo(rs);
@ -259,52 +222,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
p.setDataInfo(info); p.setDataInfo(info);
p.setLastupdatetimestamp(lastUpdateTimestamp); p.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("projectid"); return Arrays.asList(p);
// rs.getString("code");
// rs.getString("websiteurl");
// rs.getString("acronym");
// rs.getString("title");
// rs.getDate("startdate");
// rs.getDate("enddate");
// rs.getString("callidentifier");
// rs.getString("keywords");
// rs.getInt("duration");
// rs.getBoolean("ecsc39");
// rs.getBoolean("oamandatepublications");
// rs.getBoolean("ecarticle29_3");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("optional1");
// rs.getString("optional2");
// rs.getString("jsonextrainfo");
// rs.getString("contactfullname");
// rs.getString("contactfax");
// rs.getString("contactphone");
// rs.getString("contactemail");
// rs.getString("summary");
// rs.getString("currency");
// rs.getDouble("totalcost");
// rs.getDouble("fundedamount");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("contracttype"); // COMPLEX
// rs.getString("provenanceaction"); // COMPLEX
// rs.getArray("pid");
// rs.getArray("subjects");
// rs.getArray("fundingtree");
emitOaf(p);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
public void processOrganization(final ResultSet rs) { public List<Oaf> processOrganization(final ResultSet rs) {
try { try {
@ -320,11 +245,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
o.setOaiprovenance(null); // Values not present in the DB o.setOaiprovenance(null); // Values not present in the DB
o.setLegalshortname(field("legalshortname", info)); o.setLegalshortname(field(rs.getString("legalshortname"), info));
o.setLegalname(field("legalname", info)); o.setLegalname(field(rs.getString("legalname"), info));
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
o.setWebsiteurl(field("websiteurl", info)); o.setWebsiteurl(field(rs.getString("websiteurl"), info));
o.setLogourl(field("logourl", info)); o.setLogourl(field(rs.getString("logourl"), info));
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
@ -339,41 +264,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
o.setDataInfo(info); o.setDataInfo(info);
o.setLastupdatetimestamp(lastUpdateTimestamp); o.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("organizationid"); return Arrays.asList(o);
// rs.getString("legalshortname");
// rs.getString("legalname");
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getBoolean("eclegalbody");
// rs.getBoolean("eclegalperson");
// rs.getBoolean("ecnonprofit");
// rs.getBoolean("ecresearchorganization");
// rs.getBoolean("echighereducation");
// rs.getBoolean("ecinternationalorganizationeurinterests");
// rs.getBoolean("ecinternationalorganization");
// rs.getBoolean("ecenterprise");
// rs.getBoolean("ecsmevalidated");
// rs.getBoolean("ecnutscode");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("country");
// rs.getString("provenanceaction");
// rs.getArray("pid");
emitOaf(o);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
public void processDatasourceOrganization(final ResultSet rs) { public List<Oaf> processDatasourceOrganization(final ResultSet rs) {
try { try {
final DataInfo info = prepareDataInfo(rs); final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization"), true); final String orgId = createOpenaireId(20, rs.getString("organization"), true);
@ -389,7 +286,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1.setCollectedFrom(collectedFrom); r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info); r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp); r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation(); final Relation r2 = new Relation();
r2.setRelType("datasourceOrganization"); r2.setRelType("datasourceOrganization");
@ -400,29 +296,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r2.setCollectedFrom(collectedFrom); r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info); r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp); r2.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("datasource");
// rs.getString("organization");
// rs.getDate("startdate"); // NULL
// rs.getDate("enddate"); // NULL
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // 'providedBy@@@provided
// by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS
// semantics,
// rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction ||
// '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
return Arrays.asList(r1, r2);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
public void processProjectOrganization(final ResultSet rs) { public List<Oaf> processProjectOrganization(final ResultSet rs) {
try { try {
final DataInfo info = prepareDataInfo(rs); final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); final String orgId = createOpenaireId(20, rs.getString("resporganization"), true);
@ -438,7 +319,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1.setCollectedFrom(collectedFrom); r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info); r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp); r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation(); final Relation r2 = new Relation();
r2.setRelType("projectOrganization"); r2.setRelType("projectOrganization");
@ -449,30 +329,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r2.setCollectedFrom(collectedFrom); r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info); r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp); r2.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("project");
// rs.getString("resporganization");
// rs.getInt("participantnumber");
// rs.getDouble("contribution");
// rs.getDate("startdate");// null
// rs.getDate("enddate");// null
// rs.getBoolean("inferred");// false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust");
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass ||
// '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction
return Arrays.asList(r1, r2);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
public void processClaims(final ResultSet rs) { public List<Oaf> processClaims(final ResultSet rs) {
final DataInfo info = final DataInfo info =
dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9"); dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
@ -495,7 +359,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info)); r.setContext(prepareContext(rs.getString("source_id"), info));
r.setDataInfo(info); r.setDataInfo(info);
emitOaf(r);
return Arrays.asList(r);
} else { } else {
final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false);
final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false);
@ -525,14 +390,13 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1.setTarget(targetId); r1.setTarget(targetId);
r1.setDataInfo(info); r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp); r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
r2.setSource(targetId); r2.setSource(targetId);
r2.setTarget(sourceId); r2.setTarget(sourceId);
r2.setDataInfo(info); r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp); r2.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
return Arrays.asList(r1, r2);
} }
} catch (final Exception e) { } catch (final Exception e) {
@ -563,7 +427,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) { private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
try { try {
return listFields(info, (String[]) array.getArray()); return array != null ? listFields(info, (String[]) array.getArray()) : new ArrayList<>();
} catch (final SQLException e) { } catch (final SQLException e) {
throw new RuntimeException("Invalid SQL array", e); throw new RuntimeException("Invalid SQL array", e);
} }

View File

@ -69,7 +69,7 @@ public abstract class AbstractMdRecordToOafMapper {
nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
final Document doc = DocumentHelper.parseText(xml); final Document doc = DocumentHelper.parseText(xml.replaceAll("http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3"));
final String type = doc.valueOf("//dr:CobjCategory/@type"); final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));

View File

@ -28,6 +28,10 @@ public class AbstractMigrationApplication implements Closeable {
private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class);
protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST
this.writer = null;
}
public AbstractMigrationApplication(final String hdfsPath) throws Exception { public AbstractMigrationApplication(final String hdfsPath) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath));

View File

@ -22,8 +22,7 @@ SELECT
'' AS inferenceprovenance, '' AS inferenceprovenance,
d.id AS collectedfromid, d.id AS collectedfromid,
d.officialname AS collectedfromname, d.officialname AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
ARRAY[]::text[] AS pid ARRAY[]::text[] AS pid

View File

@ -11,7 +11,7 @@ SELECT
'' AS inferenceprovenance, '' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid, 'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname, 'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country, o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM organizations o FROM organizations o
@ -40,7 +40,7 @@ SELECT
'' AS inferenceprovenance, '' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid, 'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname, 'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country, o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction, 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM other_names n FROM other_names n

View File

@ -15,12 +15,4 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
</configuration> </configuration>

View File

@ -0,0 +1,62 @@
<workflow-app name="import db entities (step 1)" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>migrationPathStep1</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${migrationPathStep1}/db_records'/>
</fs>
<ok to="ImportDB"/>
<error to="Kill"/>
</action>
<action name="ImportDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -6,26 +6,28 @@ import java.nio.file.Path;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.After; import org.junit.jupiter.api.AfterEach;
import org.junit.Assert;
import org.junit.Before; import org.junit.jupiter.api.BeforeEach;
import org.junit.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.dhp.model.mdstore.Provenance;
import static org.junit.jupiter.api.Assertions.*;
public class CollectionJobTest { public class CollectionJobTest {
private Path testDir; private Path testDir;
@Before @BeforeEach
public void setup() throws IOException { public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection"); testDir = Files.createTempDirectory("dhp-collection");
} }
@After @AfterEach
public void teadDown() throws IOException { public void teadDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile()); FileUtils.deleteDirectory(testDir.toFile());
} }
@ -80,7 +82,7 @@ public class CollectionJobTest {
record.setBody("ciao"); record.setBody("ciao");
assert record1 != null; assert record1 != null;
record1.setBody("mondo"); record1.setBody("mondo");
Assert.assertEquals(record, record1); assertEquals(record, record1);
} }

View File

@ -7,13 +7,13 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message; import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageManager;
import org.junit.After; import org.junit.jupiter.api.AfterEach;
import org.junit.Before; import org.junit.jupiter.api.BeforeEach;
import org.junit.Test; import org.junit.jupiter.api.Test;
import java.io.File; import java.io.File;
import static org.junit.Assert.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.*; import static org.mockito.Mockito.*;
@ -24,7 +24,7 @@ public class DnetCollectorWorkerApplicationTests {
private MessageManager messageManager = mock(MessageManager.class); private MessageManager messageManager = mock(MessageManager.class);
private DnetCollectorWorker worker; private DnetCollectorWorker worker;
@Before @BeforeEach
public void setup() throws Exception { public void setup() throws Exception {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
final String apiJson = mapper.writeValueAsString(getApi()); final String apiJson = mapper.writeValueAsString(getApi());
@ -47,7 +47,7 @@ public class DnetCollectorWorkerApplicationTests {
} }
@After @AfterEach
public void dropDown(){ public void dropDown(){
File f = new File("/tmp/file.seq"); File f = new File("/tmp/file.seq");
f.delete(); f.delete();

View File

@ -0,0 +1,293 @@
package eu.dnetlib.dhp.migration.step1;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import java.io.IOException;
import java.sql.Array;
import java.sql.Date;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.List;
import java.util.Objects;
import static org.junit.jupiter.api.Assertions.assertEquals;
@ExtendWith(MockitoExtension.class)
public class MigrateDbEntitiesApplicationTest {
private MigrateDbEntitiesApplication app;
@Mock
private ResultSet rs;
@BeforeEach
public void setUp() {
this.app = new MigrateDbEntitiesApplication();
}
@Test
public void testProcessDatasource() throws Exception {
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
final List<Oaf> list = app.processDatasource(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Datasource ds = (Datasource) list.get(0);
assertValidId(ds.getId());
assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields));
assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields));
assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessProject() throws Exception {
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
final List<Oaf> list = app.processProject(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Project p = (Project) list.get(0);
assertValidId(p.getId());
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
final List<Oaf> list = app.processOrganization(rs);
assertEquals(1, list.size());
verifyMocks(fields);
final Organization o = (Organization) list.get(0);
assertValidId(o.getId());
assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields));
assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]);
assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]);
assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]);
assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]);
assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
public void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processDatasourceOrganization(rs);
assertEquals(2, list.size());
verifyMocks(fields);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
}
@Test
public void testProcessProjectOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
final List<Oaf> list = app.processProjectOrganization(rs);
assertEquals(2, list.size());
verifyMocks(fields);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
}
@Test
public void testProcessClaims_context() throws Exception {
final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
assertEquals(1, list.size());
verifyMocks(fields);
}
@Test
public void testProcessClaims_rels() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
assertEquals(2, list.size());
verifyMocks(fields);
}
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
final ObjectMapper mapper = new ObjectMapper();
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
for (final TypedField tf : list) {
if (tf.getValue() == null) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
break;
case "array":
Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(null);
break;
}
} else {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString()));
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString()));
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString()));
break;
case "array":
final Array arr = Mockito.mock(Array.class);
final String[] values = ((List<?>) tf.getValue()).stream()
.filter(Objects::nonNull)
.map(o -> o.toString())
.toArray(String[]::new);
Mockito.when(arr.getArray()).thenReturn(values);
Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
break;
}
}
}
return list;
}
private void verifyMocks(final List<TypedField> list) throws SQLException {
for (final TypedField tf : list) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
break;
case "date":
Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
break;
case "int":
Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
break;
case "double":
Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
break;
case "array":
Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
break;
case "string":
default:
Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
break;
}
}
}
private void assertValidId(final String id) {
assertEquals(49, id.length());
assertEquals('|', id.charAt(2));
assertEquals(':', id.charAt(15));
assertEquals(':', id.charAt(16));
}
private String getValueAsString(final String name, final List<TypedField> fields) {
return fields.stream()
.filter(f -> f.getField().equals(name))
.map(TypedField::getValue)
.filter(Objects::nonNull)
.map(o -> o.toString())
.findFirst()
.get();
}
}
class TypedField {
private String field;
private String type;
private Object value;
public String getField() {
return field;
}
public void setField(final String field) {
this.field = field;
}
public String getType() {
return type;
}
public void setType(final String type) {
this.type = type;
}
public Object getValue() {
return value;
}
public void setValue(final Object value) {
this.value = value;
}
}

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.migration.step2;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
@ExtendWith(MockitoExtension.class)
public class MappersTest {
@Mock
private Map<String, String> code2name;
@BeforeEach
void setUp() throws Exception {
when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0));
}
@Test
void testPublication() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(code2name).processMdRecord(xml);
assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Publication);
assertTrue(list.get(1) instanceof Relation);
assertTrue(list.get(2) instanceof Relation);
final Publication p = (Publication) list.get(0);
final Relation r1 = (Relation) list.get(1);
final Relation r2 = (Relation) list.get(2);
assertValidId(p.getId());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertTrue(p.getAuthor().size() > 0);
assertTrue(p.getSubject().size() > 0);
assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline()));
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
assertValidId(r1.getSource());
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
}
@Test
void testDataset() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Dataset);
final Dataset d = (Dataset) list.get(0);
assertValidId(d.getId());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
assertTrue(d.getAuthor().size() > 0);
assertTrue(d.getSubject().size() > 0);
}
@Test
void testSoftware() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Software);
final Software s = (Software) list.get(0);
assertValidId(s.getId());
assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue()));
assertTrue(s.getAuthor().size() > 0);
assertTrue(s.getSubject().size() > 0);
}
private void assertValidId(final String id) {
assertEquals(49, id.length());
assertEquals('|', id.charAt(2));
assertEquals(':', id.charAt(15));
assertEquals(':', id.charAt(16));
}
}

View File

@ -6,47 +6,32 @@ import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import net.sf.saxon.s9api.*; import net.sf.saxon.s9api.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import org.junit.*; import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.junit.MockitoJUnit; import org.mockito.junit.jupiter.MockitoExtension;
import org.mockito.junit.MockitoRule;
import javax.xml.transform.stream.StreamSource; import javax.xml.transform.stream.StreamSource;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@ExtendWith(MockitoExtension.class)
public class TransformationJobTest { public class TransformationJobTest {
@Mock @Mock
LongAccumulator accumulator; private LongAccumulator accumulator;
@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule();
private Path testDir;
@Before
public void setup() throws IOException {
testDir = Files.createTempDirectory("dhp-collection");
}
@After
public void tearDown() throws IOException {
FileUtils.deleteDirectory(testDir.toFile());
}
@Test @Test
public void testTransformSaxonHE() throws Exception { public void testTransformSaxonHE() throws Exception {
@ -70,9 +55,9 @@ public class TransformationJobTest {
System.out.println(output.toString()); System.out.println(output.toString());
} }
@DisplayName("Test TransformSparkJobNode.main")
@Test @Test
public void transformTest() throws Exception { public void transformTest(@TempDir Path testDir) throws Exception {
final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
final String mdstore_output = testDir.toString()+"/version"; final String mdstore_output = testDir.toString()+"/version";
final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); final String xslt = DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")));
@ -89,8 +74,6 @@ public class TransformationJobTest {
"-rh", "", "-rh", "",
"-ro", "", "-ro", "",
"-rr", ""}); "-rr", ""});
} }
@Test @Test
@ -121,7 +104,7 @@ public class TransformationJobTest {
record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml")));
final MetadataRecord result = tf.call(record); final MetadataRecord result = tf.call(record);
Assert.assertNotNull(result.getBody()); assertNotNull(result.getBody());
System.out.println(result.getBody()); System.out.println(result.getBody());
} }

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.transformation.vocabulary; package eu.dnetlib.dhp.transformation.vocabulary;
import org.junit.Test; import org.junit.jupiter.api.Test;
import static org.junit.Assert.*; import static org.junit.jupiter.api.Assertions.*;
public class VocabularyTest { public class VocabularyTest {

View File

@ -0,0 +1,27 @@
[
{
"field": "source_type",
"type": "string",
"value": "context"
},
{
"field": "source_id",
"type": "string",
"value": "oa-pg"
},
{
"field": "target_type",
"type": "string",
"value": "publication"
},
{
"field": "target_id",
"type": "string",
"value": "userclaim___::d99de49026e79d271f3e7451d8de18b6"
},
{
"field": "semantics",
"type": "not_used",
"value": "isRelevantTo"
}
]

View File

@ -0,0 +1,27 @@
[
{
"field": "source_type",
"type": "string",
"value": "project"
},
{
"field": "source_id",
"type": "string",
"value": "corda__h2020::b38a638a93b505d670fcacc47a0283d6"
},
{
"field": "target_type",
"type": "string",
"value": "publication"
},
{
"field": "target_id",
"type": "string",
"value": "userclaim___::5b5117253d3c64c79809d0b92fa287b4"
},
{
"field": "semantics",
"type": "not_used",
"value": "resultProject_outcome_produces"
}
]

View File

@ -0,0 +1,62 @@
[
{
"field": "datasource",
"type": "string",
"value": "openaire____::revistasunicauca"
},
{
"field": "organization",
"type": "string",
"value": "openaire____::openaire____::revistasunicauca"
},
{
"field": "startdate",
"type": "not_used",
"value": null
},
{
"field": "enddate",
"type": "not_used",
"value": null
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": null
},
{
"field": "collectedfromname",
"type": "string",
"value": null
},
{
"field": "semantics",
"type": "not_used",
"value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": null
}
]

View File

@ -0,0 +1,234 @@
[
{
"field": "datasourceid",
"type": "string",
"value": "274269ac6f3b::2579-5449"
},
{
"field": "identities",
"type": "not_used",
"value": [
"274269ac6f3b::2579-5449",
null
]
},
{
"field": "officialname",
"type": "string",
"value": "Jurnal Ilmiah Pendidikan Scholastic"
},
{
"field": "englishname",
"type": "string",
"value": "Jurnal Ilmiah Pendidikan Scholastic"
},
{
"field": "contactemail",
"type": "string",
"value": "test@test.it"
},
{
"field": "openairecompatibility",
"type": "string",
"value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel"
},
{
"field": "websiteurl",
"type": "string",
"value": "http://e-journal.sastra-unes.com/index.php/JIPS/index"
},
{
"field": "logourl",
"type": "string",
"value": null
},
{
"field": "accessinfopackage",
"type": "array",
"value": [
null
]
},
{
"field": "latitude",
"type": "double",
"value": 0
},
{
"field": "longitude",
"type": "double",
"value": 0
},
{
"field": "namespaceprefix",
"type": "string",
"value": "ojs_25795449"
},
{
"field": "odnumberofitems",
"type": "int",
"value": null
},
{
"field": "odnumberofitemsdate",
"type": "date",
"value": null
},
{
"field": "subjects",
"type": "array",
"value": null
},
{
"field": "description",
"type": "string",
"value": null
},
{
"field": "odpolicies",
"type": "string",
"value": null
},
{
"field": "odlanguages",
"type": "array",
"value": []
},
{
"field": "odcontenttypes",
"type": "array",
"value": [
"Journal articles"
]
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "dateofcollection",
"type": "date",
"value": "2020-01-21"
},
{
"field": "dateofvalidation",
"type": "date",
"value": null
},
{
"field": "releasestartdate",
"type": "date",
"value": null
},
{
"field": "releaseenddate",
"type": "date",
"value": null
},
{
"field": "missionstatementurl",
"type": "string",
"value": null
},
{
"field": "dataprovider",
"type": "boolean",
"value": null
},
{
"field": "serviceprovider",
"type": "boolean",
"value": null
},
{
"field": "databaseaccesstype",
"type": "string",
"value": null
},
{
"field": "datauploadtype",
"type": "string",
"value": null
},
{
"field": "databaseaccessrestriction",
"type": "string",
"value": null
},
{
"field": "datauploadrestriction",
"type": "string",
"value": null
},
{
"field": "versioning",
"type": "boolean",
"value": null
},
{
"field": "citationguidelineurl",
"type": "string",
"value": null
},
{
"field": "qualitymanagementkind",
"type": "string",
"value": null
},
{
"field": "pidsystems",
"type": "string",
"value": null
},
{
"field": "certificates",
"type": "string",
"value": null
},
{
"field": "policies",
"type": "not_used",
"value": []
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::SnVybmFsIEZha3VsdGFzIFNhc3RyYSBVbml2ZXJzaXRhcyBFa2FzYWt0aQ=="
},
{
"field": "collectedfromname",
"type": "string",
"value": "Jurnal Fakultas Sastra Universitas Ekasakti"
},
{
"field": "datasourcetype",
"type": "string",
"value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
},
{
"field": "journal",
"type": "string",
"value": "2579-5449@@@2597-6540@@@"
}
]

View File

@ -0,0 +1,127 @@
[
{
"field": "organizationid",
"type": "string",
"value": "openaire____::openaire____::microsoft"
},
{
"field": "legalshortname",
"type": "string",
"value": "MSFTResearch"
},
{
"field": "legalname",
"type": "string",
"value": "Microsoft Research"
},
{
"field": "websiteurl",
"type": "string",
"value": "https://www.microsoft.com/en-us/research/"
},
{
"field": "logourl",
"type": "string",
"value": null
},
{
"field": "eclegalbody",
"type": "boolean",
"value": false
},
{
"field": "eclegalperson",
"type": "boolean",
"value": false
},
{
"field": "ecnonprofit",
"type": "boolean",
"value": false
},
{
"field": "ecresearchorganization",
"type": "boolean",
"value": false
},
{
"field": "echighereducation",
"type": "boolean",
"value": false
},
{
"field": "ecinternationalorganizationeurinterests",
"type": "boolean",
"value": false
},
{
"field": "ecinternationalorganization",
"type": "boolean",
"value": false
},
{
"field": "ecenterprise",
"type": "boolean",
"value": false
},
{
"field": "ecsmevalidated",
"type": "boolean",
"value": false
},
{
"field": "ecnutscode",
"type": "boolean",
"value": false
},
{
"field": "dateofcollection",
"type": "date",
"value": "2018-10-19"
},
{
"field": "dateoftransformation",
"type": "date",
"value": "2018-10-19"
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": ""
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::TEST"
},
{
"field": "collectedfromname",
"type": "string",
"value": "TEST"
},
{
"field": "country",
"type": "string",
"value": "US@@@US@@@dnet:countries@@@dnet:countries"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
}
]

View File

@ -0,0 +1,72 @@
[
{
"field": "project",
"type": "string",
"value": "nsf_________::1700003"
},
{
"field": "resporganization",
"type": "string",
"value": "nsf_________::University_of_Notre_Dame"
},
{
"field": "participantnumber",
"type": "not_used",
"value": 1
},
{
"field": "contribution",
"type": "not_used",
"value": null
},
{
"field": "startdate",
"type": "not_used",
"value": null
},
{
"field": "enddate",
"type": "not_used",
"value": null
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::nsf"
},
{
"field": "collectedfromname",
"type": "string",
"value": "NSF - National Science Foundation"
},
{
"field": "semantics",
"type": "not_used",
"value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
}
]

View File

@ -0,0 +1,193 @@
[
{
"field": "projectid",
"type": "string",
"value": "aka_________::100469"
},
{
"field": "code",
"type": "string",
"value": "100469"
},
{
"field": "websiteurl",
"type": "string",
"value": "http://test"
},
{
"field": "acronym",
"type": "string",
"value": "RMCAG"
},
{
"field": "title",
"type": "string",
"value": "Regulation of melanoma cell autonomous growth"
},
{
"field": "startdate",
"type": "date",
"value": null
},
{
"field": "enddate",
"type": "date",
"value": null
},
{
"field": "callidentifier",
"type": "string",
"value": "Tutkijankoulutus ja työskentely ulkomailla/kevät TT"
},
{
"field": "keywords",
"type": "string",
"value": null
},
{
"field": "duration",
"type": "int",
"value": null
},
{
"field": "ecsc39",
"type": "boolean",
"value": null
},
{
"field": "oamandatepublications",
"type": "boolean",
"value": false
},
{
"field": "ecarticle29_3",
"type": "boolean",
"value": null
},
{
"field": "dateofcollection",
"type": "date",
"value": "2019-01-25"
},
{
"field": "dateoftransformation",
"type": "date",
"value": "2019-04-16"
},
{
"field": "inferred",
"type": "boolean",
"value": false
},
{
"field": "deletedbyinference",
"type": "boolean",
"value": false
},
{
"field": "trust",
"type": "string",
"value": "0.9"
},
{
"field": "inferenceprovenance",
"type": "string",
"value": null
},
{
"field": "optional1",
"type": "string",
"value": "9,284 €"
},
{
"field": "optional2",
"type": "string",
"value": null
},
{
"field": "jsonextrainfo",
"type": "string",
"value": "{}"
},
{
"field": "contactfullname",
"type": "string",
"value": null
},
{
"field": "contactfax",
"type": "string",
"value": null
},
{
"field": "contactphone",
"type": "string",
"value": null
},
{
"field": "contactemail",
"type": "string",
"value": null
},
{
"field": "summary",
"type": "string",
"value": null
},
{
"field": "currency",
"type": "string",
"value": null
},
{
"field": "totalcost",
"type": "double",
"value": null
},
{
"field": "fundedamount",
"type": "double",
"value": null
},
{
"field": "collectedfromid",
"type": "string",
"value": "openaire____::aka"
},
{
"field": "collectedfromname",
"type": "string",
"value": "Academy of Finland"
},
{
"field": "contracttype",
"type": "string",
"value": null
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions"
},
{
"field": "pid",
"type": "not_used",
"value": [
null
]
},
{
"field": "subjects",
"type": "array",
"value": [
null
]
},
{
"field": "fundingtree",
"type": "array",
"value": [
"<fundingtree><funder>\n <id>aka_________::AKA</id>\n <shortname>AKA</shortname>\n <name>Academy of Finland</name>\n <originalname>Academy of Finland</originalname>\n <jurisdiction>FI</jurisdiction>\n </funder></fundingtree>"
]
}
]

View File

@ -0,0 +1,80 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
<dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
<dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
<oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
<dc:creator>Nikolaidou,Charitini</dc:creator>
<dc:creator>Votsi,Nefta</dc:creator>
<dc:creator>Sgardelis,Steanos</dc:creator>
<dc:creator>Halley,John</dc:creator>
<dc:creator>Pantis,John</dc:creator>
<dc:creator>Tsiafouli,Maria</dc:creator>
<dc:date>2017</dc:date>
<dc:description>The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management.</dc:description>
<dc:format>text/html</dc:format>
<dc:identifier>https://doi.org/10.3897/oneeco.2.e13718</dc:identifier>
<dc:identifier>https://oneecosystem.pensoft.net/article/13718/</dc:identifier>
<dc:language>eng</dc:language>
<dc:publisher>Pensoft Publishers</dc:publisher>
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:subject>Ecosystem Services hotspots</dc:subject>
<dc:subject>Natura 2000</dc:subject>
<dc:subject>Quiet Protected Areas</dc:subject>
<dc:subject>Biodiversity</dc:subject>
<dc:subject>Agriculture</dc:subject>
<dc:subject>Elevation</dc:subject>
<dc:subject>Slope</dc:subject>
<dc:subject>Ecosystem Service trade-offs and synergies</dc:subject>
<dc:subject> cultural services</dc:subject>
<dc:subject>provisioning services</dc:subject>
<dc:subject>regulating services</dc:subject>
<dc:subject>supporting services</dc:subject>
<dc:type>Research Article</dc:type>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::issn226852" name="One Ecosystem"/>
<oaf:collectedFrom
id="openaire____::45e3c7b69bcee6cc5fa945c9e183deb9" name="Pensoft"/>
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
</metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-03-23T00:20:51.392Z">
<baseURL>http%3A%2F%2Fzookeys.pensoft.net%2Foai.php</baseURL>
<identifier>10.3897/oneeco.2.e13718</identifier>
<datestamp>2017-09-08</datestamp>
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<oai:header xmlns="http://namespace.openaire.eu/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<dri:objIdentifier>r37b0ad08687::000374d100a9db469bd42b69dbb40b36</dri:objIdentifier>
<dri:recordIdentifier>10.5281/zenodo.3234526</dri:recordIdentifier>
<dri:dateOfCollection>2020-03-21T00:05:35.927Z</dri:dateOfCollection>
<oaf:datasourceprefix>r37b0ad08687</oaf:datasourceprefix>
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:zenodo.org:3234526</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2020-03-19T10:58:08Z</datestamp>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">openaire_data</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">user-epfl</setSpec>
</oai:header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<identifier identifierType="DOI">10.5281/zenodo.3234526</identifier>
<creators>
<creator>
<creatorName>Nouchi, Vincent</creatorName>
<givenName>Vincent</givenName>
<familyName>Nouchi</familyName>
<affiliation>Physics of Aquatic Systems Laboratory (APHYS) Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland</affiliation>
</creator>
<creator>
<creatorName>Lavanchy, Sébastien</creatorName>
<givenName>Sébastien</givenName>
<familyName>Lavanchy</familyName>
<affiliation>Physics of Aquatic Systems Laboratory (APHYS) Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland</affiliation>
</creator>
<creator>
<creatorName>Baracchini, Theo</creatorName>
<givenName>Theo</givenName>
<familyName>Baracchini</familyName>
<affiliation>Physics of Aquatic Systems Laboratory (APHYS) Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland</affiliation>
</creator>
<creator>
<creatorName>Wüest, Alfred</creatorName>
<givenName>Alfred</givenName>
<familyName>Wüest</familyName>
<affiliation>Physics of Aquatic Systems Laboratory (APHYS) Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland</affiliation>
</creator>
<creator>
<creatorName>Bouffard, Damien</creatorName>
<givenName>Damien</givenName>
<familyName>Bouffard</familyName>
<affiliation>Eawag, Swiss Federal Institute of Aquatic Science and Technology, Surface Waters Research and Management, Kastanienbaum, 6047, Switzerland</affiliation>
</creator>
</creators>
<titles>
<title>Temperature and ADCP data collected on Lake Geneva between 2015 and 2017</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2019</publicationYear>
<subjects>
<subject>Lake Geneva</subject>
<subject>temperature</subject>
<subject>ADCP</subject>
</subjects>
<dates>
<date dateType="Issued">2019-05-29</date>
</dates>
<resourceType resourceTypeGeneral="Dataset"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.3234525</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">https://zenodo.org/communities/epfl</relatedIdentifier>
</relatedIdentifiers>
<version>1.0.0</version>
<rightsList>
<rights rightsURI="http://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract"><p>Data collected between 2015 and 2017 on Lake Geneva by Acoustic Doppler Current Profiler (ADCP) and CTDs. One file includes all the temperature profiles, the two others are the ADCP data (up- and down-looking) at the SHL2 station (centre of the main basin). Coordinates of the SHL2 station are 534700 and 144950 in the Swiss CH1903 coordinate system. The file with the CTD data contains the coordinates of the sample location (lat, lon), times (in MATLAB time), depths (in meters) and temperatures (in C).</p>
<p>All files are in MATLAB .mat format.</p></description>
</descriptions>
</resource>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:dateAccepted>2019-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>und</oaf:language>
<oaf:concept id="https://zenodo.org/communities/epfl"/>
<oaf:hostedBy id="re3data_____::r3d100010468" name="Zenodo"/>
<oaf:collectedFrom id="re3data_____::r3d100010468" name="Zenodo"/>
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-03-21T00:05:35.927Z">
<baseURL>https%3A%2F%2Fzenodo.org%2Foai2d</baseURL>
<identifier>oai:zenodo.org:3234526</identifier>
<datestamp>2020-03-19T10:58:08Z</datestamp>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,82 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:functx="http://www.functx.com"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:xs="http://www.w3.org/2001/XMLSchema">
<oai:header xmlns="http://namespace.openaire.eu/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dri:objIdentifier>__bioTools__::001321907fcc9f8d020f05230f9d3ddf</dri:objIdentifier>
<dri:recordIdentifier>chainy</dri:recordIdentifier>
<dri:dateOfCollection>2020-02-05T10:49:49.694Z</dri:dateOfCollection>
<oaf:datasourceprefix>__bioTools__</oaf:datasourceprefix>
<dr:dateOfTransformation>2020-02-05T10:56:28.875Z</dr:dateOfTransformation>
</oai:header>
<metadata>
<datacite:resource>
<datacite:resourceType resourceTypeGeneral="Software">Web application</datacite:resourceType>
<datacite:publisher>bio.tools</datacite:publisher>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="URL" relationType="IsDocumentedBy">http://maplab.imppc.org/chainy/</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="DOI" relationType="isReferencedBy">10.1093/bioinformatics/btw839</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<datacite:alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="LandingPage">https://bio.tools/</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:identifier identifierType="URL">https://bio.tools/</datacite:identifier>
<datacite:titles>
<datacite:title>Chainy</datacite:title>
</datacite:titles>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>Mallona, Izaskun</datacite:creatorName>
<datacite:givenName>Izaskun</datacite:givenName>
<datacite:familyName>Mallona</datacite:familyName>
</datacite:creator>
</datacite:creators>
<datacite:contributors/>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">Universal tool for standardized relative quantification in real-time PCR.</datacite:description>
<datacite:description descriptionType="TechnicalInfo">Linux</datacite:description>
<datacite:description descriptionType="TechnicalInfo">Windows</datacite:description>
<datacite:description descriptionType="TechnicalInfo">Mac</datacite:description>
</datacite:descriptions>
<datacite:subjects>
<datacite:subject schemeURI="http://edamontology.org"
subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3519">PCR experiment</datacite:subject>
<datacite:subject schemeURI="http://edamontology.org"
subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_0203">Gene expression</datacite:subject>
<datacite:subject schemeURI="http://edamontology.org"
subjectScheme="EDAM Ontology" valueURI="http://edamontology.org/topic_3534">Protein binding sites</datacite:subject>
</datacite:subjects>
</datacite:resource>
<dr:CobjCategory type="software">0029</dr:CobjCategory>
<oaf:hostedBy id="rest________::bioTools" name="bio.tools"/>
<oaf:collectedFrom id="rest________::bioTools" name="bio.tools"/>
<oaf:dateAccepted>2018-06-06</oaf:dateAccepted>
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-02-05T10:49:49.694Z">
<baseURL>https%3A%2F%2Fbio.tools%2Fapi%2Ftool</baseURL>
<identifier/>
<datestamp/>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -0,0 +1,97 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.7-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,119 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils;
import java.time.Year;
import java.util.*;
import java.util.stream.Collectors;
import static java.util.Collections.reverseOrder;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import static org.apache.commons.lang.StringUtils.endsWith;
import static org.apache.commons.lang.StringUtils.substringBefore;
public class DatePicker {
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
private static final String DATE_DEFAULT_SUFFIX = "01-01";
private static final int YEAR_LB = 1300;
private static final int YEAR_UB = Year.now().getValue() + 5;
public static Field<String> pick(final Collection<String> dateofacceptance) {
final Map<String, Integer> frequencies = dateofacceptance
.parallelStream()
.filter(StringUtils::isNotBlank)
.collect(
Collectors.toConcurrentMap(
w -> w, w -> 1, Integer::sum));
if (frequencies.isEmpty()) {
return new Field<>();
}
final Field<String> date = new Field<>();
date.setValue(frequencies.keySet().iterator().next());
// let's sort this map by values first, filtering out invalid dates
final Map<String, Integer> sorted = frequencies
.entrySet()
.stream()
.filter(d -> StringUtils.isNotBlank(d.getKey()))
.filter(d -> d.getKey().matches(DATE_PATTERN))
.filter(d -> inRange(d.getKey()))
.sorted(reverseOrder(comparingByValue()))
.collect(
toMap(
Map.Entry::getKey,
Map.Entry::getValue, (e1, e2) -> e2,
LinkedHashMap::new));
// shortcut
if (sorted.size() == 0) {
return date;
}
// voting method (1/3 + 1) wins
if (sorted.size() >= 3) {
final int acceptThreshold = (sorted.size() / 3) + 1;
final List<String> accepted = sorted.entrySet().stream()
.filter(e -> e.getValue() >= acceptThreshold)
.map(e -> e.getKey())
.collect(Collectors.toList());
// cannot find strong majority
if (accepted.isEmpty()) {
final int max = sorted.values().iterator().next();
Optional<String> first = sorted.entrySet().stream()
.filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
.map(Map.Entry::getKey)
.findFirst();
if (first.isPresent()) {
date.setValue(first.get());
return date;
}
date.setValue(sorted.keySet().iterator().next());
return date;
}
if (accepted.size() == 1) {
date.setValue(accepted.get(0));
return date;
} else {
final Optional<String> first = accepted.stream()
.filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
.findFirst();
if (first.isPresent()) {
date.setValue(first.get());
return date;
}
return date;
}
//1st non YYYY-01-01 is returned
} else {
if (sorted.size() == 2) {
for (Map.Entry<String, Integer> e : sorted.entrySet()) {
if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
date.setValue(e.getKey());
return date;
}
}
}
// none of the dates seems good enough, return the 1st one
date.setValue(sorted.keySet().iterator().next());
return date;
}
}
private static boolean inRange(final String date) {
final int year = Integer.parseInt(substringBefore(date, "-"));
return year >= YEAR_LB && year <= YEAR_UB;
}
}

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.oa.dedup;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
@ -16,9 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper;
import scala.Tuple2; import scala.Tuple2;
import java.util.Collection; import java.util.Collection;
import java.util.Random;
import static java.util.stream.Collectors.toMap;
public class DedupRecordFactory { public class DedupRecordFactory {

View File

@ -0,0 +1,221 @@
package eu.dnetlib.dhp.oa.dedup;
import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import scala.Tuple2;
import java.io.StringReader;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
public class DedupUtility {
private static final Double THRESHOLD = 0.95;
public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {
Map<String, LongAccumulator> accumulators = new HashMap<>();
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
accumulators.put(acc1, context.longAccumulator(acc1));
String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
accumulators.put(acc2, context.longAccumulator(acc2));
String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
accumulators.put(acc3, context.longAccumulator(acc3));
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
accumulators.put(acc4, context.longAccumulator(acc4));
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
accumulators.put(acc5, context.longAccumulator(acc5));
String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
accumulators.put(acc6, context.longAccumulator(acc6));
return accumulators;
}
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
}
public static String md5(final String s) {
try {
final MessageDigest md = MessageDigest.getInstance("MD5");
md.update(s.getBytes("UTF-8"));
return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) {
System.err.println("Error creating id");
return null;
}
}
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base, enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
if (pa == pb) {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} else {
base = pa > pb ? a : b;
enrich = pa > pb ? b : a;
}
enrichPidFromList(base, enrich);
return base;
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
if (base == null || enrich == null)
return;
final Map<String, Author> basePidAuthorMap = base.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(a -> a.getPid()
.stream()
.map(p -> new Tuple2<>(p.toComparableString(), a))
).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList());
pidToEnrich.forEach(a -> {
Optional<Tuple2<Double, Author>> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
Author r = simAuhtor.get()._2();
r.getPid().add(a._1());
}
});
}
public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType);
}
public static String createEntityPath(final String basePath, final String entityType) {
return String.format("%s/%s", basePath, entityType);
}
public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) {
return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType);
}
public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) {
return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType);
}
private static Double sim(Author a, Author b) {
final Person pa = parse(a);
final Person pb = parse(b);
if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler().score(
normalize(pa.getSurnameString()),
normalize(pb.getSurnameString()));
} else {
return new JaroWinkler().score(
normalize(pa.getNormalisedFullname()),
normalize(pb.getNormalisedFullname()));
}
}
private static String normalize(final String s) {
return nfd(s).toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
private static Person parse(Author author) {
if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false);
} else {
return new Person(author.getFullname(), false);
}
}
private static int countAuthorsPids(List<Author> authors) {
if (authors == null)
return 0;
return (int) authors.stream().filter(DedupUtility::hasPid).count();
}
private static int authorsSize(List<Author> authors) {
if (authors == null)
return 0;
return authors.size();
}
private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().size() == 0)
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
public static List<DedupConfig> getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl);
final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator);
String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery);
final Document doc = new SAXReader().read(new StringReader(orchestratorProfile));
final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id");
final List<DedupConfig> configurations = new ArrayList<>();
for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) {
configurations.add(loadConfig(isLookUpService, actionSetId, o));
}
return configurations;
}
private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o)
throws ISLookUpException {
final Element s = (Element) o;
final String configProfileId = s.attributeValue("id");
final String conf =
isLookUpService.getResourceProfileByQuery(String.format(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
configProfileId));
final DedupConfig dedupConfig = DedupConfig.load(conf);
dedupConfig.getWf().setConfigurationId(actionSetId);
return dedupConfig;
}
}

View File

@ -0,0 +1,161 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.util.LongAccumulator;
import scala.Serializable;
import scala.Tuple2;
import java.util.*;
import java.util.stream.Collectors;
public class Deduper implements Serializable {
private static final Log log = LogFactory.getLog(Deduper.class);
/**
* @return the list of relations generated by the deduplication
* @param: the spark context
* @param: list of JSON entities to be deduped
* @param: the dedup configuration
*/
public static JavaPairRDD<String, String> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
//create vertexes of the graph: <ID, MapDocument>
JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);
//create blocks for deduplication
JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);
//create relations by comparing only elements in the same group
return computeRelations(context, blocks, config);
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo")).rdd();
//
// RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
//
// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
}
/**
* @return the list of relations generated by the deduplication
* @param: the spark context
* @param: list of blocks
* @param: the dedup configuration
*/
public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
final SparkReporter reporter = new SparkReporter(accumulators);
new BlockProcessor(config).process(it._1(), it._2(), reporter);
return reporter.getRelations().iterator();
}).mapToPair(
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
.reduceByKey((a, b) -> a)
.mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
}
/**
* @return the list of blocks based on clustering of dedup configuration
* @param: the spark context
* @param: list of entities: <id, entity>
* @param: the dedup configuration
*/
public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
return mapDocs
//the reduce is just to be sure that we haven't document with same id
.reduceByKey((a, b) -> a)
.map(Tuple2::_2)
//Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair((PairFlatMapFunction<MapDocument, String, MapDocument>) a ->
DedupUtility.getGroupingKeys(config, a)
.stream()
.map(it -> new Tuple2<>(it, a))
.collect(Collectors.toList())
.iterator())
.groupByKey();
}
public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
final String of = config.getWf().getOrderField();
final int maxQueueSize = config.getWf().getGroupMaxSize();
return mapDocs
//the reduce is just to be sure that we haven't document with same id
.reduceByKey((a, b) -> a)
.map(Tuple2::_2)
//Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair((PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a ->
DedupUtility.getGroupingKeys(config, a)
.stream()
.map(it -> {
List<MapDocument> tmp = new ArrayList<>();
tmp.add(a);
return new Tuple2<>(it, tmp);
}
)
.collect(Collectors.toList())
.iterator())
.reduceByKey((Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
v1.addAll(v2);
v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
if (v1.size() > maxQueueSize)
return new ArrayList<>(v1.subList(0, maxQueueSize));
return v1;
});
}
/**
* @return the list of vertexes: <id, mapDocument>
* @param: the spark context
* @param: list of JSON entities
* @param: the dedup configuration
*/
public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {
return entities.mapToPair((PairFunction<String, String, MapDocument>) s -> {
MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);
});
}
public static JavaPairRDD<String, String> computeRelations2(JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());
return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
try {
final SparkReporter reporter = new SparkReporter(accumulators);
new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
return reporter.getRelations().iterator();
} catch (Exception e) {
throw new RuntimeException(it._2().get(0).getIdentifier(), e);
}
}).mapToPair(
(PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
.reduceByKey((a, b) -> a)
.mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
}
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dhp.oa.dedup;
public enum OafEntityType {
datasource,
organization,
project,
dataset,
otherresearchproduct,
software,
publication
}

View File

@ -0,0 +1,101 @@
package eu.dnetlib.dhp.oa.dedup;
import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
public class SparkCreateConnectedComponent {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
parser.parseArgument(args);
new SparkCreateConnectedComponent().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
final String graphBasePath = parser.get("graphBasePath");
final String workingPath = parser.get("workingPath");
final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId");
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
final String entity = dedupConf.getWf().getEntityType();
final String subEntity = dedupConf.getWf().getSubEntityValue();
final JavaPairRDD<Object, String> vertexes = sc.textFile(graphBasePath + "/" + subEntity)
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>)
s -> new Tuple2<Object, String>(getHashcode(s), s)
);
final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class));
final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
c.getDocIds()
.stream()
.flatMap(id -> {
List<Relation> tmp = new ArrayList<>();
Relation r = new Relation();
r.setSource(c.getCcId());
r.setTarget(id);
r.setRelClass("merges");
tmp.add(r);
r = new Relation();
r.setTarget(c.getCcId());
r.setSource(id);
r.setRelClass("isMergedIn");
tmp.add(r);
return tmp.stream();
}).iterator()).rdd(), Encoders.bean(Relation.class));
mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity));
}
}
}
public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashString(id).asLong();
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.getOrCreate();
}
}

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
public class SparkCreateDedupRecord {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
parser.parseArgument(args);
new SparkCreateDedupRecord().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
final String graphBasePath = parser.get("graphBasePath");
final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId");
final String workingPath = parser.get("workingPath");
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
String subEntity = dedupConf.getWf().getSubEntityValue();
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity);
final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity);
final OafEntityType entityType = OafEntityType.valueOf(subEntity);
final JavaRDD<OafEntity> dedupRecord =
DedupRecordFactory.createDedupRecord(sc, spark, mergeRelPath, entityPath, entityType, dedupConf);
dedupRecord.map(r -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(r);
}).saveAsTextFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity));
}
}
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkCreateDedupRecord.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}

View File

@ -0,0 +1,134 @@
package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
import scala.Tuple2;
import java.io.Serializable;
import java.util.List;
public class SparkCreateSimRels implements Serializable {
private static final Log log = LogFactory.getLog(SparkCreateSimRels.class);
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
parser.parseArgument(args);
new SparkCreateSimRels().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException {
//read oozie parameters
final String graphBasePath = parser.get("graphBasePath");
final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId");
final String workingPath = parser.get("workingPath");
System.out.println(String.format("graphBasePath: '%s'", graphBasePath));
System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl));
System.out.println(String.format("actionSetId: '%s'", actionSetId));
System.out.println(String.format("workingPath: '%s'", workingPath));
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
//for each dedup configuration
for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) {
final String entity = dedupConf.getWf().getEntityType();
final String subEntity = dedupConf.getWf().getSubEntityValue();
JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
.mapToPair(s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
return new Tuple2<>(d.getIdentifier(), d);
});
//create blocks for deduplication
JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
//create relations by comparing only elements in the same group
final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
JavaRDD<Relation> relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity));
//save the simrel in the workingdir
spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class))
.write()
.mode("overwrite")
.save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity));
}
}
}
/**
* Utility method used to create an atomic action from a Relation object
* @param relation input relation
* @return A tuple2 with [id, json serialization of the atomic action]
* @throws JsonProcessingException
*/
public Tuple2<Text, Text> createSequenceFileRow(Relation relation) throws JsonProcessingException {
ObjectMapper mapper = new ObjectMapper();
String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget();
AtomicAction<Relation> aa = new AtomicAction<>(Relation.class, relation);
return new Tuple2<>(
new Text(id),
new Text(mapper.writeValueAsString(aa))
);
}
public Relation createSimRel(String source, String target, String entity){
final Relation r = new Relation();
r.setSource(source);
r.setTarget(target);
switch(entity){
case "result":
r.setRelClass("resultResult_dedupSimilarity_isSimilarTo");
break;
case "organization":
r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo");
break;
default:
r.setRelClass("isSimilarTo");
break;
}
return r;
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.getOrCreate();
}
}

View File

@ -0,0 +1,171 @@
package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.io.IOException;
public class SparkPropagateRelation {
enum FieldType {
SOURCE,
TARGET
}
final static String SOURCEJSONPATH = "$.source";
final static String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
parser.parseArgument(args);
new SparkPropagateRelation().run(parser);
}
public void run(ArgumentApplicationParser parser) {
final String graphBasePath = parser.get("graphBasePath");
final String workingPath = parser.get("workingPath");
final String dedupGraphPath = parser.get("dedupGraphPath");
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final Dataset<Relation> mergeRels = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", "*")).as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds = mergeRels
.where("relClass == 'merges'")
.select(mergeRels.col("source"), mergeRels.col("target"))
.distinct()
.toJavaRDD()
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(1), r.getString(0)));
JavaRDD<String> relations = sc.textFile(DedupUtility.createEntityPath(graphBasePath, "relation"));
JavaRDD<String> newRels = relations.mapToPair(
(PairFunction<String, String, String>) s ->
new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s))
.leftOuterJoin(mergedIds)
.map((Function<Tuple2<String, Tuple2<String, Optional<String>>>, String>) v1 -> {
if (v1._2()._2().isPresent()) {
return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.SOURCE);
}
return v1._2()._1();
})
.mapToPair(
(PairFunction<String, String, String>) s ->
new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s))
.leftOuterJoin(mergedIds)
.map((Function<Tuple2<String, Tuple2<String, Optional<String>>>, String>) v1 -> {
if (v1._2()._2().isPresent()) {
return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.TARGET);
}
return v1._2()._1();
}).filter(SparkPropagateRelation::containsDedup)
.repartition(500);
//update deleted by inference
relations = relations.mapToPair(
(PairFunction<String, String, String>) s ->
new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s))
.leftOuterJoin(mergedIds)
.map((Function<Tuple2<String, Tuple2<String, Optional<String>>>, String>) v1 -> {
if (v1._2()._2().isPresent()) {
return updateDeletedByInference(v1._2()._1(), Relation.class);
}
return v1._2()._1();
})
.mapToPair(
(PairFunction<String, String, String>) s ->
new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s))
.leftOuterJoin(mergedIds)
.map((Function<Tuple2<String, Tuple2<String, Optional<String>>>, String>) v1 -> {
if (v1._2()._2().isPresent()) {
return updateDeletedByInference(v1._2()._1(), Relation.class);
}
return v1._2()._1();
})
.repartition(500);
newRels.union(relations).repartition(1000)
.saveAsTextFile(DedupUtility.createEntityPath(dedupGraphPath, "relation"), GzipCodec.class);
}
}
private static boolean containsDedup(final String json) {
final String source = MapDocumentUtil.getJPathString(SOURCEJSONPATH, json);
final String target = MapDocumentUtil.getJPathString(TARGETJSONPATH, json);
return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup");
}
private static String replaceField(final String json, final String id, final FieldType type) {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Relation relation = mapper.readValue(json, Relation.class);
if (relation.getDataInfo() == null)
relation.setDataInfo(new DataInfo());
relation.getDataInfo().setDeletedbyinference(false);
switch (type) {
case SOURCE:
relation.setSource(id);
return mapper.writeValueAsString(relation);
case TARGET:
relation.setTarget(id);
return mapper.writeValueAsString(relation);
default:
throw new IllegalArgumentException("");
}
} catch (IOException e) {
throw new RuntimeException("unable to deserialize json relation: " + json, e);
}
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkPropagateRelation.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Oaf entity = mapper.readValue(json, clazz);
if (entity.getDataInfo()== null)
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setDeletedbyinference(true);
return mapper.writeValueAsString(entity);
} catch (IOException e) {
throw new RuntimeException("Unable to convert json", e);
}
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.util.LongAccumulator;
import scala.Serializable;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
public class SparkReporter implements Serializable, Reporter {
final List<Tuple2<String, String>> relations = new ArrayList<>();
private static final Log log = LogFactory.getLog(SparkReporter.class);
Map<String, LongAccumulator> accumulators;
public SparkReporter(Map<String, LongAccumulator> accumulators){
this.accumulators = accumulators;
}
public void incrementCounter(String counterGroup, String counterName, long delta, Map<String, LongAccumulator> accumulators) {
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
if (accumulators.containsKey(accumulatorName)){
accumulators.get(accumulatorName).add(delta);
}
}
@Override
public void incrementCounter(String counterGroup, String counterName, long delta) {
incrementCounter(counterGroup, counterName, delta, accumulators);
}
@Override
public void emit(String type, String from, String to) {
relations.add(new Tuple2<>(from, to));
}
public List<Tuple2<String, String>> getRelations() {
return relations;
}
}

View File

@ -0,0 +1,143 @@
package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
public class SparkUpdateEntity implements Serializable {
final String IDJSONPATH = "$.id";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
parser.parseArgument(args);
new SparkUpdateEntity().run(parser);
}
public boolean mergeRelExists(String basePath, String entity) throws IOException {
boolean result = false;
FileSystem fileSystem = FileSystem.get(new Configuration());
FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath));
for (FileStatus fs : fileStatuses) {
if (fs.isDirectory())
if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity))))
result = true;
}
return result;
}
public void run(ArgumentApplicationParser parser) throws IOException {
final String graphBasePath = parser.get("graphBasePath");
final String workingPath = parser.get("workingPath");
final String dedupGraphPath = parser.get("dedupGraphPath");
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
//for each entity
for (OafEntityType entity: OafEntityType.values()) {
JavaRDD<String> sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString()));
if (mergeRelExists(workingPath, entity.toString())) {
final Dataset<Relation> rel = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", entity.toString())).as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds = rel
.where("relClass == 'merges'")
.select(rel.col("target"))
.distinct()
.toJavaRDD()
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
final JavaRDD<String> dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString()));
JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s));
JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1());
sourceEntity = map.union(dedupEntity);
}
sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class);
}
}
}
public Class<? extends Oaf> getOafClass(OafEntityType className) {
switch (className.toString()) {
case "publication":
return Publication.class;
case "dataset":
return eu.dnetlib.dhp.schema.oaf.Dataset.class;
case "datasource":
return Datasource.class;
case "software":
return Software.class;
case "organization":
return Organization.class;
case "otherresearchproduct":
return OtherResearchProduct.class;
case "project":
return Project.class;
default:
throw new IllegalArgumentException("Illegal type " + className);
}
}
private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Oaf entity = mapper.readValue(json, clazz);
if (entity.getDataInfo()== null)
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setDeletedbyinference(true);
return mapper.writeValueAsString(entity);
} catch (IOException e) {
throw new RuntimeException("Unable to convert json", e);
}
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession
.builder()
.appName(SparkUpdateEntity.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.oa.dedup.graph;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.IOException;
import java.io.Serializable;
import java.util.Set;
public class ConnectedComponent implements Serializable {
private Set<String> docIds;
private String ccId;
public ConnectedComponent() {
}
public ConnectedComponent(Set<String> docIds) {
this.docIds = docIds;
createID();
}
public String createID() {
if (docIds.size() > 1) {
final String s = getMin();
String prefix = s.split("\\|")[0];
ccId =prefix + "|dedup_______::" + DedupUtility.md5(s);
return ccId;
} else {
return docIds.iterator().next();
}
}
@JsonIgnore
public String getMin(){
final StringBuilder min = new StringBuilder();
docIds.forEach(i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
}
@Override
public String toString(){
ObjectMapper mapper = new ObjectMapper();
try {
return mapper.writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Failed to create Json: ", e);
}
}
public Set<String> getDocIds() {
return docIds;
}
public void setDocIds(Set<String> docIds) {
this.docIds = docIds;
}
public String getCcId() {
return ccId;
}
public void setCcId(String ccId) {
this.ccId = ccId;
}
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.oa.dedup.graph
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import scala.collection.JavaConversions;
object GraphProcessor {
def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
val cc = graph.connectedComponents(maxIterations).vertices
val joinResult = vertexes.leftOuterJoin(cc).map {
case (id, (openaireId, cc)) => {
if (cc.isEmpty) {
(id, openaireId)
}
else {
(cc.get, openaireId)
}
}
}
val connectedComponents = joinResult.groupByKey()
.map[ConnectedComponent](cc => asConnectedComponent(cc))
connectedComponents
}
def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
val docs = group._2.toSet[String]
val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
connectedComponent
}
}

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,105 @@
<workflow-app name="Update Graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphBasePath</name>
<description>the raw graph base path</description>
</property>
<property>
<name>workingPath</name>
<description>path of the working directory</description>
</property>
<property>
<name>dedupGraphPath</name>
<description>path of the dedup graph</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="UpdateEntity"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="UpdateEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path='${dedupGraphPath}'/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>Update Entity</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--w</arg><arg>${workingPath}</arg>
<arg>--o</arg><arg>${dedupGraphPath}</arg>
</spark>
<ok to="PropagateRelation"/>
<error to="Kill"/>
</action>
<action name="PropagateRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path='${dedupGraphPath}/relation'/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>Update Relations</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--o</arg><arg>${dedupGraphPath}</arg>
<arg>--w</arg><arg>${workingPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,32 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "asi",
"paramLongName": "actionSetId",
"paramDescription": "action set identifier (name of the orchestrator)",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "graphBasePath",
"paramDescription": "the base path of the raw graph",
"paramRequired": true
},
{
"paramName": "la",
"paramLongName": "isLookUpUrl",
"paramDescription": "the url for the lookup service",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path for the working directory",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "graphBasePath",
"paramDescription": "the base path of raw graph",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working directory path",
"paramRequired": true
},
{
"paramName": "la",
"paramLongName": "isLookUpUrl",
"paramDescription": "the url of the lookup service",
"paramRequired": true
},
{
"paramName": "asi",
"paramLongName": "actionSetId",
"paramDescription": "the id of the actionset (orchestrator)",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "la",
"paramLongName": "isLookUpUrl",
"paramDescription": "address for the LookUp",
"paramRequired": true
},
{
"paramName": "asi",
"paramLongName": "actionSetId",
"paramDescription": "action set identifier (name of the orchestrator)",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "graphBasePath",
"paramDescription": "the base path of the raw graph",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "graphBasePath",
"paramDescription": "the base path of raw graph",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working directory path",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "dedupGraphPath",
"paramDescription": "the path of the dedup graph",
"paramRequired": true
}
]

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,138 @@
<workflow-app name="Duplicate Scan" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphBasePath</name>
<description>the raw graph base path</description>
</property>
<property>
<name>isLookUpUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>actionSetId</name>
<description>id of the actionSet</description>
</property>
<property>
<name>workingPath</name>
<description>path for the working directory</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
</configuration>
</global>
<start to="CreateSimRel"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="CreateSimRel">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path="${workingPath}/${actionSetId}/*_simrel"/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg>
<arg>--asi</arg><arg>${actionSetId}</arg>
<arg>--w</arg><arg>${workingPath}</arg>
</spark>
<ok to="CreateMergeRel"/>
<error to="Kill"/>
</action>
<action name="CreateMergeRel">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path='${workingPath}/${actionSetId}/*_mergerel'/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>Create Merge Relations</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--w</arg><arg>${workingPath}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg>
<arg>--asi</arg><arg>${actionSetId}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path='${workingPath}/${actionSetId}/*_deduprecord'/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>--i</arg><arg>${graphBasePath}</arg>
<arg>--w</arg><arg>${workingPath}</arg>
<arg>--la</arg><arg>${isLookUpUrl}</arg>
<arg>--asi</arg><arg>${actionSetId}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,26 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "graphBasePath",
"paramDescription": "the base path of raw graph",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "the working directory path",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "dedupGraphPath",
"paramDescription": "the path of the dedup graph",
"paramRequired": true
}
]

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.oa.dedup.dedup;
import eu.dnetlib.dhp.oa.dedup.DedupUtility;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
import org.junit.Before; import org.junit.jupiter.api.BeforeEach;
import org.junit.Test;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
@ -13,12 +13,12 @@ import java.util.stream.Collectors;
public class MergeAuthorTest { public class MergeAuthorTest {
List<Publication> publicationsToMerge; private List<Publication> publicationsToMerge;
final ObjectMapper mapper = new ObjectMapper(); private final ObjectMapper mapper = new ObjectMapper();
@Before @BeforeEach
public void setUp() throws Exception { public void setUp() throws Exception {
final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dedup/json/authors_merge.json")); final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json"));
publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> { publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> {
@ -28,34 +28,19 @@ public class MergeAuthorTest {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
}).collect(Collectors.toList()); }).collect(Collectors.toList());
} }
//FIX ME Michele DB this tests doesn't work
@Test //@Test
public void test() throws Exception { public void test() throws Exception {
Publication dedup = new Publication(); Publication dedup = new Publication();
publicationsToMerge.forEach(p-> { publicationsToMerge.forEach(p-> {
dedup.mergeFrom(p); dedup.mergeFrom(p);
dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor())); dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor()));
}); });
System.out.println(mapper.writeValueAsString(dedup)); System.out.println(mapper.writeValueAsString(dedup));
} }
} }

View File

@ -1,48 +1,43 @@
package eu.dnetlib.dedup; package eu.dnetlib.dhp.oa.dedup.dedup;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.hash.HashFunction; import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.oa.dedup.SparkCreateConnectedComponent;
import org.apache.commons.io.FileUtils; import eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord;
import org.apache.commons.io.IOUtils; import eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels;
import org.junit.Before; import org.junit.jupiter.api.BeforeEach;
import org.junit.Ignore; import org.junit.jupiter.api.Disabled;
import org.junit.Test;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.List;
public class SparkCreateDedupTest { public class SparkCreateDedupTest {
String configuration; String configuration;
String entity = "organization"; String entity = "organization";
@Before @BeforeEach
public void setUp() throws IOException { public void setUp() throws IOException {
configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); // configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json"));
configuration = "";
} }
@Test @Disabled("must be parametrized to run locally")
@Ignore
public void createSimRelsTest() throws Exception { public void createSimRelsTest() throws Exception {
SparkCreateSimRels.main(new String[] { SparkCreateSimRels.main(new String[]{
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/Users/miconis/dumps", "-i", "/Users/miconis/dumps",
"-e", entity, "-o", "/tmp/dedup/rawset_test",
"-c", ArgumentApplicationParser.compressArgument(configuration), "-asi", "dedup-similarity-result-levenstein",
"-t", "/tmp/dedup", "-la", "lookupurl",
"-w", "workingPath"
}); });
} }
@Test @Disabled("must be parametrized to run locally")
@Ignore
public void createCCTest() throws Exception { public void createCCTest() throws Exception {
SparkCreateConnectedComponent.main(new String[] { SparkCreateConnectedComponent.main(new String[]{
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/Users/miconis/dumps", "-s", "/Users/miconis/dumps",
"-e", entity, "-e", entity,
@ -51,10 +46,9 @@ public class SparkCreateDedupTest {
}); });
} }
@Test @Disabled("must be parametrized to run locally")
@Ignore
public void dedupRecordTest() throws Exception { public void dedupRecordTest() throws Exception {
SparkCreateDedupRecord.main(new String[] { SparkCreateDedupRecord.main(new String[]{
"-mt", "local[*]", "-mt", "local[*]",
"-s", "/Users/miconis/dumps", "-s", "/Users/miconis/dumps",
"-e", entity, "-e", entity,
@ -63,24 +57,21 @@ public class SparkCreateDedupTest {
}); });
} }
@Test @Disabled("must be parametrized to run locally")
public void printConfiguration() throws Exception { public void printConfiguration() throws Exception {
System.out.println(ArgumentApplicationParser.compressArgument(configuration)); System.out.println(ArgumentApplicationParser.compressArgument(configuration));
} }
@Test @Disabled("must be parametrized to run locally")
public void testHashCode() { public void testHashCode() {
final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f"; final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f";
final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46"; final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46";
final HashFunction hashFunction = Hashing.murmur3_128(); final HashFunction hashFunction = Hashing.murmur3_128();
System.out.println( s1.hashCode()); System.out.println(s1.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s1).asLong()); System.out.println(hashFunction.hashString(s1).asLong());
System.out.println( s2.hashCode()); System.out.println(s2.hashCode());
System.out.println(hashFunction.hashUnencodedChars(s2).asLong()); System.out.println(hashFunction.hashString(s2).asLong());
} }
} }

File diff suppressed because one or more lines are too long

View File

@ -3,6 +3,7 @@
"threshold" : "0.99", "threshold" : "0.99",
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "organization", "entityType" : "organization",
"subEntityValue": "organization",
"orderField" : "legalname", "orderField" : "legalname",
"queueMaxSize" : "2000", "queueMaxSize" : "2000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
@ -87,8 +88,8 @@
} }
} }
], ],
"threshold": 0.7, "threshold": 0.1,
"aggregation": "W_MEAN", "aggregation": "AVG",
"positive": "layer4", "positive": "layer4",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "NO_MATCH",
@ -106,7 +107,7 @@
} }
} }
], ],
"threshold": 0.9, "threshold": 0.7,
"aggregation": "AVG", "aggregation": "AVG",
"positive": "layer5", "positive": "layer5",
"negative": "NO_MATCH", "negative": "NO_MATCH",
@ -129,7 +130,9 @@
"comparator": "jaroWinklerNormalizedName", "comparator": "jaroWinklerNormalizedName",
"weight": 0.1, "weight": 0.1,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": {} "params": {
"windowSize": 4
}
} }
], ],
"threshold": 0.9, "threshold": 0.9,
@ -145,14 +148,14 @@
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" } { "name" : "originalId", "type" : "String", "path" : "$.id" }
], ],
"blacklists" : { "blacklists" : {
"legalname" : [] "legalname" : []
}, },
"synonyms": { "synonyms": {
"key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],

View File

@ -28,34 +28,10 @@
"idPath": "$.id" "idPath": "$.id"
}, },
"pace": { "pace": {
"clustering": [ "clustering" : [
{ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
"name": "ngrampairs", { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
"fields": [ { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
},
{
"name": "lowercase",
"fields": [
"doi"
],
"params": {}
}
], ],
"decisionTree": { "decisionTree": {
"start": { "start": {

View File

@ -3,15 +3,15 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version> <version>1.1.7-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup</artifactId> <artifactId>dhp-dedup-scholexplorer</artifactId>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
<groupId>net.alchim31.maven</groupId> <groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId> <artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version> <version>4.0.1</version>
@ -36,8 +36,8 @@
<scalaVersion>${scala.version}</scalaVersion> <scalaVersion>${scala.version}</scalaVersion>
</configuration> </configuration>
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<dependencies> <dependencies>
@ -61,10 +61,6 @@
<artifactId>dhp-schemas</artifactId> <artifactId>dhp-schemas</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>

View File

@ -0,0 +1,283 @@
package eu.dnetlib.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import scala.Tuple2;
import java.util.Collection;
public class DedupRecordFactory {
public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
long ts = System.currentTimeMillis();
//<id, json_entity>
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
.mapToPair((PairFunction<String, String, String>) it ->
new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)
);
//<source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, String> mergeRels = spark
.read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.mapToPair(
(PairFunction<Relation, String, String>) r ->
new Tuple2<String, String>(r.getTarget(), r.getSource())
);
//<dedup_id, json_entity_merged>
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
switch (entityType) {
case publication:
return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
case dataset:
return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
case project:
return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
case software:
return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
case datasource:
return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
case organization:
return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
case otherresearchproduct:
return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
default:
return null;
}
}
private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Publication p = new Publication(); //the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(pub -> {
try {
Publication publication = mapper.readValue(pub, Publication.class);
p.mergeFrom(publication);
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
//add to the list if they are not null
if (publication.getDateofacceptance() != null)
dateofacceptance.add(publication.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Dataset d = new Dataset(); //the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(dat -> {
try {
Dataset dataset = mapper.readValue(dat, Dataset.class);
d.mergeFrom(dataset);
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
//add to the list if they are not null
if (dataset.getDateofacceptance() != null)
dateofacceptance.add(dataset.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Project p = new Project(); //the result of the merge, to be returned at the end
p.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e._2().forEach(proj -> {
try {
Project project = mapper.readValue(proj, Project.class);
p.mergeFrom(project);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (p.getDataInfo() == null)
p.setDataInfo(new DataInfo());
p.getDataInfo().setTrust("0.9");
p.setLastupdatetimestamp(ts);
return p;
}
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Software s = new Software(); //the result of the merge, to be returned at the end
s.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(soft -> {
try {
Software software = mapper.readValue(soft, Software.class);
s.mergeFrom(software);
s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
//add to the list if they are not null
if (software.getDateofacceptance() != null)
dateofacceptance.add(software.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
s.setDateofacceptance(DatePicker.pick(dateofacceptance));
if (s.getDataInfo() == null)
s.setDataInfo(new DataInfo());
s.getDataInfo().setTrust("0.9");
s.setLastupdatetimestamp(ts);
return s;
}
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Datasource d = new Datasource(); //the result of the merge, to be returned at the end
d.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
if (e._2() != null)
e._2().forEach(dat -> {
try {
Datasource datasource = mapper.readValue(dat, Datasource.class);
d.mergeFrom(datasource);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (d.getDataInfo() == null)
d.setDataInfo(new DataInfo());
d.getDataInfo().setTrust("0.9");
d.setLastupdatetimestamp(ts);
return d;
}
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
Organization o = new Organization(); //the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
StringBuilder trust = new StringBuilder("0.0");
if (e._2() != null)
e._2().forEach(pub -> {
try {
Organization organization = mapper.readValue(pub, Organization.class);
final String currentTrust = organization.getDataInfo().getTrust();
if (!"1.0".equals(currentTrust)) {
trust.setLength(0);
trust.append(currentTrust);
}
o.mergeFrom(organization);
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null)
{
o.setDataInfo(new DataInfo());
}
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {
OtherResearchProduct o = new OtherResearchProduct(); //the result of the merge, to be returned at the end
o.setId(e._1());
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Collection<String> dateofacceptance = Lists.newArrayList();
if (e._2() != null)
e._2().forEach(orp -> {
try {
OtherResearchProduct otherResearchProduct = mapper.readValue(orp, OtherResearchProduct.class);
o.mergeFrom(otherResearchProduct);
o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
//add to the list if they are not null
if (otherResearchProduct.getDateofacceptance() != null)
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
} catch (Exception exc) {
throw new RuntimeException(exc);
}
});
if (o.getDataInfo() == null)
o.setDataInfo(new DataInfo());
o.setDateofacceptance(DatePicker.pick(dateofacceptance));
o.getDataInfo().setTrust("0.9");
o.setLastupdatetimestamp(ts);
return o;
}
}

View File

@ -6,7 +6,6 @@ import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
@ -29,7 +28,6 @@ import java.security.MessageDigest;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupUtility { public class DedupUtility {
private static final Double THRESHOLD = 0.95; private static final Double THRESHOLD = 0.95;
@ -151,11 +149,11 @@ public class DedupUtility {
} }
public static String createSimRelPath(final String basePath, final String entityType) { public static String createSimRelPath(final String basePath, final String entityType) {
return String.format("%s/%s_simRel", basePath, entityType); return String.format("%s/%s/simRel", basePath, entityType);
} }
public static String createMergeRelPath(final String basePath, final String entityType) { public static String createMergeRelPath(final String basePath, final String entityType) {
return String.format("%s/%s_mergeRel", basePath, entityType); return String.format("%s/%s/mergeRel", basePath, entityType);
} }
private static Double sim(Author a, Author b) { private static Double sim(Author a, Author b) {

View File

@ -76,4 +76,5 @@ public class SparkCreateConnectedComponent {
public static long getHashcode(final String id) { public static long getHashcode(final String id) {
return Hashing.murmur3_128().hashString(id).asLong(); return Hashing.murmur3_128().hashString(id).asLong();
} }
} }

View File

@ -10,7 +10,6 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
public class SparkCreateDedupRecord { public class SparkCreateDedupRecord {
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json"))); final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
@ -24,16 +23,12 @@ public class SparkCreateDedupRecord {
final String sourcePath = parser.get("sourcePath"); final String sourcePath = parser.get("sourcePath");
final String entity = parser.get("entity"); final String entity = parser.get("entity");
final String dedupPath = parser.get("dedupPath"); final String dedupPath = parser.get("dedupPath");
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf);
dedupRecord.map(r-> { dedupRecord.map(r-> {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(r); return mapper.writeValueAsString(r);
}).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json"); }).saveAsTextFile(dedupPath+"/"+entity+"/dedup_records");
} }
} }

View File

@ -44,7 +44,7 @@ public class SparkCreateSimRels {
// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); // final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));
final long total = sc.textFile(inputPath + "/" + entity).count();
JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity) JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity)
.mapToPair(s->{ .mapToPair(s->{
@ -70,4 +70,4 @@ public class SparkCreateSimRels {
spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity));
} }
} }

View File

@ -0,0 +1,97 @@
package eu.dnetlib.dedup.sx;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;
import java.io.IOException;
public class SparkPropagateRelationsJob {
enum FieldType {
SOURCE,
TARGET
}
final static String SOURCEJSONPATH = "$.source";
final static String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String relationPath = parser.get("relationPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String targetRelPath = parser.get("targetRelPath");
final Dataset<Relation> merge = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)).where("relClass == 'merges'");
final Dataset<Relation> rels= spark.read().load(relationPath).as(Encoders.bean(Relation.class));
final Dataset<Relation> firstJoin = rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if(mergeRelation!= null)
relation.setSource(mergeRelation.getSource());
return relation;
}, Encoders.bean(Relation.class));
final Dataset<Relation> secondJoin = firstJoin.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if (mergeRelation != null )
relation.setTarget(mergeRelation.getSource());
return relation;
}, Encoders.bean(Relation.class));
secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
}
private static boolean containsDedup(final String json) {
final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json);
final String target = DHPUtils.getJPathString(TARGETJSONPATH, json);
return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup");
}
private static String replaceField(final String json, final String id, final FieldType type) {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Relation relation = mapper.readValue(json, Relation.class);
if (relation.getDataInfo() == null)
relation.setDataInfo(new DataInfo());
relation.getDataInfo().setDeletedbyinference(false);
switch (type) {
case SOURCE:
relation.setSource(id);
return mapper.writeValueAsString(relation);
case TARGET:
relation.setTarget(id);
return mapper.writeValueAsString(relation);
default:
throw new IllegalArgumentException("");
}
} catch (IOException e) {
throw new RuntimeException("unable to deserialize json relation: " + json, e);
}
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.dedup.sx;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;
import java.io.IOException;
public class SparkUpdateEntityJob {
final static String IDJSONPATH = "$.id";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String entityPath = parser.get("entityPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String dedupRecordPath = parser.get("dedupRecordPath");
final String entity = parser.get("entity");
final String destination = parser.get("targetPath");
final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds = df
.where("relClass == 'merges'")
.select(df.col("target"))
.distinct()
.toJavaRDD()
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);
JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));
Class<? extends Oaf> mainClass;
switch (entity) {
case "publication":
mainClass = DLIPublication.class;
break;
case "dataset":
mainClass = DLIDataset.class;
break;
case "unknown":
mainClass = DLIUnknown.class;
break;
default:
throw new IllegalArgumentException("Illegal type " + entity);
}
JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1());
map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
}
private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Oaf entity = mapper.readValue(json, clazz);
if (entity.getDataInfo()== null)
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setDeletedbyinference(true);
return mapper.writeValueAsString(entity);
} catch (IOException e) {
throw new RuntimeException("Unable to convert json", e);
}
}
}

View File

@ -0,0 +1,38 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "entityPath",
"paramDescription": "the input entity path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "dr",
"paramLongName": "dedupRecordPath",
"paramDescription": "the inputPath of dedup record",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the targetPath",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "relationPath",
"paramDescription": "the input relation path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetRelPath",
"paramDescription": "the output Rel Path",
"paramRequired": true
}
]

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,177 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DeleteWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteWorkingPath">
<fs>
<delete path='${targetPath}/${entity}'/>
<mkdir path="${targetPath}"/>
<mkdir path="${targetPath}/${entity}"/>
</fs>
<ok to="CreateSimRels"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="fixRelation"/>
<error to="Kill"/>
</action>
<action name="fixRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Propagate Dedup Relations</name>
<class>eu.dnetlib.dedup.sx.SparkPropagateRelationsJob</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
<arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
</spark>
<ok to="updateDeletedByInferenceEntity"/>
<error to="Kill"/>
</action>
<action name="updateDeletedByInferenceEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Update ${entity} and add DedupRecord</name>
<class>eu.dnetlib.dedup.sx.SparkUpdateEntityJob</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
<arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
</spark>
<ok to="replaceEntity"/>
<error to="Kill"/>
</action>
<action name="replaceEntity">
<fs>
<delete path='${sourcePath}/${entity}'/>
<delete path='${sourcePath}/relation'/>
<move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
<move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,43 +1,151 @@
{ {
"wf" : { "wf": {
"threshold" : "0.99", "threshold": "0.99",
"dedupRun" : "001", "dedupRun": "001",
"entityType" : "result", "entityType": "result",
"subEntityType" : "resulttype", "subEntityType": "resulttype",
"subEntityValue" : "publication", "subEntityValue": "publication",
"orderField" : "title", "orderField": "title",
"queueMaxSize" : "2000", "queueMaxSize": "2000",
"groupMaxSize" : "100", "groupMaxSize": "100",
"maxChildren" : "100", "maxChildren": "100",
"idPath": "$.id", "slidingWindowSize": "200",
"slidingWindowSize" : "200", "rootBuilder": [
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], ],
"includeChildren" : "true" "includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
}, },
"pace" : { "pace": {
"clustering" : [ "clustering": [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, {
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, "name": "ngrampairs",
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } "fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
}
], ],
"strictConditions" : [ "decisionTree": {
{ "name" : "pidMatch", "fields" : [ "pid" ] } "start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "pid",
"type": "JSON",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[*].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
], ],
"conditions" : [ "blacklists": {
{ "name" : "titleVersionMatch", "fields" : [ "title" ] }, "title": [
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" },
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 },
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 },
{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" }
],
"synonyms": {},
"blacklists" : {
"title" : [
"^Inside Front Cover$", "^Inside Front Cover$",
"^CORR Insights$",
"^Index des notions$",
"^Department of Error.$",
"^Untitled Item$",
"^Department of Error$",
"^Tome II : 1598 à 1605$",
"^(à lexception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$",
"^Museen und Ausstellungsinstitute in Nürnberg$",
"^Text/Conference Paper$",
"^Table des illustrations$",
"^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$",
"^Index des noms$",
"^Reply by Authors.$",
"^Titelblatt - Inhalt$",
"^Index des œuvres,$",
"(?i)^Poster presentations$", "(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$", "^Problems with perinatal pathology\\.?$",
@ -48,7 +156,6 @@
"^Cartas? ao editor Letters? to the Editor$", "^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$", "^Note from the Editor$",
"^Anesthesia Abstract$", "^Anesthesia Abstract$",
"^Annual report$", "^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$", "(?i)^Graph and Table of Infectious Diseases?$",
@ -68,14 +175,12 @@
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$", "^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$", "^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$", "^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$", "(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$", "^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$", "^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$", "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
@ -96,10 +201,8 @@
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$", "^Aus der AGMB$",
"^Znanstveno-stručni prilozi$", "^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
@ -136,7 +239,6 @@
"(?i)^RUBRIKA UREDNIKA$", "(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$", "^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$", "^Yōgaku kōyō$",
"^Internetový marketing$", "^Internetový marketing$",
"^Internet marketing$", "^Internet marketing$",
"^Chūtō kokugo dokuhon$", "^Chūtō kokugo dokuhon$",
@ -169,21 +271,17 @@
"^Information System Assessment and Proposal for ICT Modification$", "^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$", "^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$", "^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$", "^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$", "^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*", "^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$", "^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$", "^Analýza reklamy$",
"^Analysis of advertising$", "^Analysis of advertising$",
"^Shōgaku shūshinsho$", "^Shōgaku shūshinsho$",
"^Shōgaku sansū$", "^Shōgaku sansū$",
"^Shintei joshi kokubun$", "^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$", "^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$", "^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$", "^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$", "(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$", "^Normas para la publicación de artículos$",
@ -202,7 +300,6 @@
"^Abdominal [Aa]ortic [Aa]neurysms.*$", "^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$", "^Pseudomyxoma peritonei$",
"^Kazalo autora$", "^Kazalo autora$",
"(?i)^uvodna riječ$", "(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$", "^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$", "^Motivation as a leadership$",
@ -275,6 +372,7 @@
"(?i)^.*authors[']? reply\\.?$", "(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$" "(?i)^.*authors[']? response\\.?$"
] ]
} },
"synonyms": {}
} }
} }

Some files were not shown because too many files have changed in this diff Show More