diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java index 5f94e7869..08aa42f5b 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -3,7 +3,6 @@ package eu.dnetlib.maven.plugin.properties; import java.io.File; import java.util.ArrayList; import java.util.List; - import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.maven.plugin.AbstractMojo; @@ -12,60 +11,63 @@ import org.apache.maven.plugin.MojoFailureException; /** * Generates oozie properties which were not provided from commandline. - * @author mhorst * + * @author mhorst * @goal generate-properties */ public class GenerateOoziePropertiesMojo extends AbstractMojo { - public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; - public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; - - private final String[] limiters = {"dhp", "dnetlib", "eu"}; - + public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; + public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; + + private final String[] limiters = {"dhp", "dnetlib", "eu"}; + @Override public void execute() throws MojoExecutionException, MojoFailureException { - if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) && - !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { - String generatedSandboxName = generateSandboxName(System.getProperties().getProperty( - PROPERTY_NAME_WF_SOURCE_DIR)); - if (generatedSandboxName!=null) { - 
System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, - generatedSandboxName); - } else { - System.out.println("unable to generate sandbox name from path: " + - System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); - } - } + if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) + && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { + String generatedSandboxName = + generateSandboxName( + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + if (generatedSandboxName != null) { + System.getProperties() + .setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); + } else { + System.out.println( + "unable to generate sandbox name from path: " + + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + } + } } - + /** * Generates sandbox name from workflow source directory. + * * @param wfSourceDir * @return generated sandbox name */ private String generateSandboxName(String wfSourceDir) { -// utilize all dir names until finding one of the limiters - List sandboxNameParts = new ArrayList(); - String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); - ArrayUtils.reverse(tokens); - if (tokens.length>0) { - for (String token : tokens) { - for (String limiter : limiters) { - if (limiter.equals(token)) { - return sandboxNameParts.size()>0? - StringUtils.join(sandboxNameParts.toArray()):null; - } - } - if (sandboxNameParts.size()>0) { - sandboxNameParts.add(0, File.separator); - } - sandboxNameParts.add(0, token); - } - return StringUtils.join(sandboxNameParts.toArray()); - } else { - return null; - } + // utilize all dir names until finding one of the limiters + List sandboxNameParts = new ArrayList(); + String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); + ArrayUtils.reverse(tokens); + if (tokens.length > 0) { + for (String token : tokens) { + for (String limiter : limiters) { + if (limiter.equals(token)) { + return sandboxNameParts.size() > 0 + ? 
StringUtils.join(sandboxNameParts.toArray()) + : null; + } + } + if (sandboxNameParts.size() > 0) { + sandboxNameParts.add(0, File.separator); + } + sandboxNameParts.add(0, token); + } + return StringUtils.join(sandboxNameParts.toArray()); + } else { + return null; + } } - } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java index 62f04761a..acfd9a7f8 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java @@ -1,19 +1,17 @@ /** - * - * Licensed under the Educational Community License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Educational Community License, Version 2.0 (the "License"); you may not use + * this file except in compliance with the License. You may obtain a copy of the License at * - * http://www.opensource.org/licenses/ecl2.php + *

http://www.opensource.org/licenses/ecl2.php * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package eu.dnetlib.maven.plugin.properties; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -26,7 +24,6 @@ import java.util.List; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; - import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; @@ -38,29 +35,24 @@ import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.Resource; import org.springframework.core.io.ResourceLoader; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; - /** - * Writes project properties for the keys listed in specified properties files. - * Based on: + * Writes project properties for the keys listed in specified properties files. 
Based on: * http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html - + * * @author mhorst * @goal write-project-properties */ public class WritePredefinedProjectProperties extends AbstractMojo { - - private static final String CR = "\r"; - private static final String LF = "\n"; - private static final String TAB = "\t"; + + private static final String CR = "\r"; + private static final String LF = "\n"; + private static final String TAB = "\t"; protected static final String PROPERTY_PREFIX_ENV = "env."; private static final String ENCODING_UTF8 = "utf8"; - - /** - * @parameter property="properties.includePropertyKeysFromFiles" - */ - private String[] includePropertyKeysFromFiles; - + + /** @parameter property="properties.includePropertyKeysFromFiles" */ + private String[] includePropertyKeysFromFiles; + /** * @parameter default-value="${project}" * @required @@ -70,55 +62,57 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * The file that properties will be written to - * + * * @parameter property="properties.outputFile" - * default-value="${project.build.directory}/properties/project.properties"; + * default-value="${project.build.directory}/properties/project.properties"; * @required */ protected File outputFile; - - /** - * If true, the plugin will silently ignore any non-existent properties files, and the build will continue + + /** + * If true, the plugin will silently ignore any non-existent properties files, and the build + * will continue * * @parameter property="properties.quiet" default-value="true" */ private boolean quiet; - + /** - * Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed, - * tab=tab. Any other values are taken literally. - * + * Comma separated list of characters to escape when writing property values. cr=carriage + * return, lf=linefeed, tab=tab. Any other values are taken literally. 
+ * * @parameter default-value="cr,lf,tab" property="properties.escapeChars" */ private String escapeChars; /** - * If true, the plugin will include system properties when writing the properties file. System properties override - * both environment variables and project properties. - * + * If true, the plugin will include system properties when writing the properties file. System + * properties override both environment variables and project properties. + * * @parameter default-value="false" property="properties.includeSystemProperties" */ private boolean includeSystemProperties; /** - * If true, the plugin will include environment variables when writing the properties file. Environment variables - * are prefixed with "env". Environment variables override project properties. - * + * If true, the plugin will include environment variables when writing the properties file. + * Environment variables are prefixed with "env". Environment variables override project + * properties. + * * @parameter default-value="false" property="properties.includeEnvironmentVariables" */ private boolean includeEnvironmentVariables; /** * Comma separated set of properties to exclude when writing the properties file - * + * * @parameter property="properties.exclude" */ private String exclude; /** - * Comma separated set of properties to write to the properties file. If provided, only the properties matching - * those supplied here will be written to the properties file. - * + * Comma separated set of properties to write to the properties file. If provided, only the + * properties matching those supplied here will be written to the properties file. 
+ * * @parameter property="properties.include" */ private String include; @@ -127,7 +121,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { * @see org.apache.maven.plugin.AbstractMojo#execute() */ @Override - @SuppressFBWarnings({"NP_UNWRITTEN_FIELD","UWF_UNWRITTEN_FIELD"}) + @SuppressFBWarnings({"NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD"}) public void execute() throws MojoExecutionException, MojoFailureException { Properties properties = new Properties(); // Add project properties @@ -149,10 +143,11 @@ public class WritePredefinedProjectProperties extends AbstractMojo { getLog().info("Creating " + outputFile); writeProperties(outputFile, comment, properties, escapeTokens); - } + } /** * Provides environment variables. + * * @return environment variables */ protected static Properties getEnvironmentVariables() { @@ -165,42 +160,45 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Removes properties which should not be written. + * * @param properties * @param omitCSV * @param includeCSV * @throws MojoExecutionException */ - protected void trim(Properties properties, String omitCSV, String includeCSV) throws MojoExecutionException { + protected void trim(Properties properties, String omitCSV, String includeCSV) + throws MojoExecutionException { List omitKeys = getListFromCSV(omitCSV); for (String key : omitKeys) { properties.remove(key); } - + List includeKeys = getListFromCSV(includeCSV); -// mh: including keys from predefined properties - if (includePropertyKeysFromFiles!=null && includePropertyKeysFromFiles.length>0) { - for (String currentIncludeLoc : includePropertyKeysFromFiles) { - if (validate(currentIncludeLoc)) { - Properties p = getProperties(currentIncludeLoc); - for (String key : p.stringPropertyNames()) { - includeKeys.add(key); - } - } - } + // mh: including keys from predefined properties + if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { + for (String 
currentIncludeLoc : includePropertyKeysFromFiles) { + if (validate(currentIncludeLoc)) { + Properties p = getProperties(currentIncludeLoc); + for (String key : p.stringPropertyNames()) { + includeKeys.add(key); + } + } + } } - if (includeKeys!=null && !includeKeys.isEmpty()) { -// removing only when include keys provided - Set keys = properties.stringPropertyNames(); + if (includeKeys != null && !includeKeys.isEmpty()) { + // removing only when include keys provided + Set keys = properties.stringPropertyNames(); for (String key : keys) { if (!includeKeys.contains(key)) { properties.remove(key); } - } + } } } /** * Checks whether file exists. + * * @param location * @return true when exists, false otherwise. */ @@ -219,6 +217,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Validates resource location. + * * @param location * @return true when valid, false otherwise * @throws MojoExecutionException @@ -238,6 +237,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Provides input stream. + * * @param location * @return input stream * @throws IOException @@ -254,6 +254,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Creates properties for given location. + * * @param location * @return properties for given location * @throws MojoExecutionException @@ -278,6 +279,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Provides escape characters. + * * @param escapeChars * @return escape characters */ @@ -293,6 +295,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Provides real token. + * * @param token * @return real token */ @@ -310,6 +313,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Returns content. + * * @param comment * @param properties * @param escapeTokens @@ -332,13 +336,15 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Writes properties to given file. 
+ * * @param file * @param comment * @param properties * @param escapeTokens * @throws MojoExecutionException */ - protected void writeProperties(File file, String comment, Properties properties, List escapeTokens) + protected void writeProperties( + File file, String comment, Properties properties, List escapeTokens) throws MojoExecutionException { try { String content = getContent(comment, properties, escapeTokens); @@ -347,15 +353,16 @@ public class WritePredefinedProjectProperties extends AbstractMojo { throw new MojoExecutionException("Error creating properties file", e); } } - + /** * Escapes characters. + * * @param s * @param escapeChars * @return */ protected String escape(String s, List escapeChars) { - String result = s; + String result = s; for (String escapeChar : escapeChars) { result = result.replace(escapeChar, getReplacementToken(escapeChar)); } @@ -364,6 +371,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { /** * Provides replacement token. + * * @param escapeChar * @return replacement token */ @@ -380,21 +388,22 @@ public class WritePredefinedProjectProperties extends AbstractMojo { } /** - * Returns list from csv. - * @param csv - * @return list of values generated from CSV - */ - protected static final List getListFromCSV(String csv) { - if (StringUtils.isBlank(csv)) { - return new ArrayList(); - } - List list = new ArrayList(); - String[] tokens = StringUtils.split(csv, ","); - for (String token : tokens) { - list.add(token.trim()); - } - return list; - } + * Returns list from csv. 
+ * + * @param csv + * @return list of values generated from CSV + */ + protected static final List getListFromCSV(String csv) { + if (StringUtils.isBlank(csv)) { + return new ArrayList(); + } + List list = new ArrayList(); + String[] tokens = StringUtils.split(csv, ","); + for (String token : tokens) { + list.add(token.trim()); + } + return list; + } public void setIncludeSystemProperties(boolean includeSystemProperties) { this.includeSystemProperties = includeSystemProperties; @@ -419,18 +428,17 @@ public class WritePredefinedProjectProperties extends AbstractMojo { public void setQuiet(boolean quiet) { this.quiet = quiet; } - - /** - * Sets property files for which keys properties should be included. - * @param includePropertyKeysFromFiles - */ - public void setIncludePropertyKeysFromFiles( - String[] includePropertyKeysFromFiles) { - if (includePropertyKeysFromFiles!=null) { - this.includePropertyKeysFromFiles = Arrays.copyOf( - includePropertyKeysFromFiles, - includePropertyKeysFromFiles.length); - } - } - -} \ No newline at end of file + + /** + * Sets property files for which keys properties should be included. 
+ * + * @param includePropertyKeysFromFiles + */ + public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { + if (includePropertyKeysFromFiles != null) { + this.includePropertyKeysFromFiles = + Arrays.copyOf( + includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); + } + } +} diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java index a2cb8e0f1..74da20756 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -1,33 +1,30 @@ package eu.dnetlib.maven.plugin.properties; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_WF_SOURCE_DIR; import static org.junit.jupiter.api.Assertions.*; -/** - * @author mhorst, claudio.atzori - * - */ +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** @author mhorst, claudio.atzori */ public class GenerateOoziePropertiesMojoTest { private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); - + @BeforeEach public void clearSystemProperties() { System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); } - + @Test public void testExecuteEmpty() throws Exception { // execute mojo.execute(); - + // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } @Test @@ -37,64 
+34,63 @@ public class GenerateOoziePropertiesMojoTest { String sandboxName = "originalSandboxName"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); - + // execute mojo.execute(); - + // assert assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } - + @Test public void testExecuteEmptyWorkflowSourceDir() throws Exception { // given String workflowSourceDir = ""; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - + // execute mojo.execute(); - + // assert assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } - + @Test public void testExecuteNullSandboxNameGenerated() throws Exception { // given String workflowSourceDir = "eu/dnetlib/dhp/"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - + // execute mojo.execute(); - + // assert assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } - + @Test public void testExecute() throws Exception { // given String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - + // execute mojo.execute(); - + // assert assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } - + @Test public void testExecuteWithoutRoot() throws Exception { // given String workflowSourceDir = "wf/transformers"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - + // execute mojo.execute(); - + // assert assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); } - } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 4b7213078..da832042f 100644 --- 
a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -1,5 +1,12 @@ package eu.dnetlib.maven.plugin.properties; +import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.lenient; + +import java.io.*; +import java.util.Properties; import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.project.MavenProject; import org.junit.jupiter.api.*; @@ -9,24 +16,12 @@ import org.mockito.Mock; import org.mockito.MockitoAnnotations; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.*; -import java.util.Properties; - -import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.lenient; - -/** - * @author mhorst, claudio.atzori - * - */ +/** @author mhorst, claudio.atzori */ @ExtendWith(MockitoExtension.class) public class WritePredefinedProjectPropertiesTest { - @Mock - private MavenProject mavenProject; - + @Mock private MavenProject mavenProject; + private WritePredefinedProjectProperties mojo; @BeforeEach @@ -39,18 +34,18 @@ public class WritePredefinedProjectPropertiesTest { } // ----------------------------------- TESTS --------------------------------------------- - + @Test public void testExecuteEmpty() throws Exception { // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); assertEquals(0, storedProperties.size()); } - + @Test public void 
testExecuteWithProjectProperties() throws Exception { // given @@ -59,10 +54,10 @@ public class WritePredefinedProjectPropertiesTest { Properties projectProperties = new Properties(); projectProperties.setProperty(key, value); doReturn(projectProperties).when(mavenProject).getProperties(); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); @@ -70,7 +65,7 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(key)); assertEquals(value, storedProperties.getProperty(key)); } - + @Test() public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { // given @@ -80,13 +75,14 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); doReturn(projectProperties).when(mavenProject).getProperties(); mojo.outputFile = testFolder; - + // execute Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } - + @Test - public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { + public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) + throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -97,10 +93,10 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(excludedKey, excludedValue); doReturn(projectProperties).when(mavenProject).getProperties(); mojo.setExclude(excludedKey); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -108,9 +104,10 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(key)); assertEquals(value, storedProperties.getProperty(key)); } - + @Test - public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws 
Exception { + public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) + throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -121,10 +118,10 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); mojo.setInclude(includedKey); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -132,9 +129,10 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - + @Test - public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { + public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) + throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -144,17 +142,18 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - + File includedPropertiesFile = new File(testFolder, "included.properties"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.store(new FileWriter(includedPropertiesFile), null); - - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); - + + mojo.setIncludePropertyKeysFromFiles( + new String[] {includedPropertiesFile.getAbsolutePath()}); + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -162,9 +161,10 @@ public class WritePredefinedProjectPropertiesTest { 
assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - + @Test - public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception { + public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) + throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -174,12 +174,13 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - - mojo.setIncludePropertyKeysFromFiles(new String[] {"/eu/dnetlib/maven/plugin/properties/included.properties"}); - + + mojo.setIncludePropertyKeysFromFiles( + new String[] {"/eu/dnetlib/maven/plugin/properties/included.properties"}); + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -187,7 +188,7 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - + @Test public void testExecuteIncludingPropertyKeysFromBlankLocation() { // given @@ -199,15 +200,16 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - + mojo.setIncludePropertyKeysFromFiles(new String[] {""}); - + // execute Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } - + @Test - public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception { + public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) + throws Exception { // given String key = 
"projectPropertyKey"; String value = "projectPropertyValue"; @@ -217,17 +219,18 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - + File includedPropertiesFile = new File(testFolder, "included.xml"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); - - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); - + + mojo.setIncludePropertyKeysFromFiles( + new String[] {includedPropertiesFile.getAbsolutePath()}); + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -235,9 +238,10 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(includedKey)); assertEquals(includedValue, storedProperties.getProperty(includedKey)); } - + @Test - public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception { + public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) + throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -247,59 +251,60 @@ public class WritePredefinedProjectPropertiesTest { projectProperties.setProperty(key, value); projectProperties.setProperty(includedKey, includedValue); doReturn(projectProperties).when(mavenProject).getProperties(); - + File includedPropertiesFile = new File(testFolder, "included.xml"); Properties includedProperties = new Properties(); includedProperties.setProperty(includedKey, "irrelevantValue"); includedProperties.store(new FileOutputStream(includedPropertiesFile), null); - - mojo.setIncludePropertyKeysFromFiles(new String[] 
{includedPropertiesFile.getAbsolutePath()}); - + + mojo.setIncludePropertyKeysFromFiles( + new String[] {includedPropertiesFile.getAbsolutePath()}); + // execute Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } - + @Test public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { // given mojo.setQuiet(true); mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); assertEquals(0, storedProperties.size()); } - + @Test public void testExecuteIncludingPropertyKeysFromInvalidFile() { // given mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); - + // execute Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); } - + @Test public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { // given mojo.setIncludeEnvironmentVariables(true); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); assertTrue(storedProperties.size() > 0); for (Object currentKey : storedProperties.keySet()) { - assertTrue(((String)currentKey).startsWith(PROPERTY_PREFIX_ENV)); + assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); } } - + @Test public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { // given @@ -307,10 +312,10 @@ public class WritePredefinedProjectPropertiesTest { String value = "systemPropertyValue"; System.setProperty(key, value); mojo.setIncludeSystemProperties(true); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -318,9 +323,10 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(key)); assertEquals(value, 
storedProperties.getProperty(key)); } - + @Test - public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception { + public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) + throws Exception { // given String key = "systemPropertyKey "; String value = "systemPropertyValue"; @@ -328,10 +334,10 @@ public class WritePredefinedProjectPropertiesTest { mojo.setIncludeSystemProperties(true); String escapeChars = "cr,lf,tab,|"; mojo.setEscapeChars(escapeChars); - + // execute mojo.execute(); - + // assert assertTrue(mojo.outputFile.exists()); Properties storedProperties = getStoredProperties(testFolder); @@ -340,14 +346,15 @@ public class WritePredefinedProjectPropertiesTest { assertTrue(storedProperties.containsKey(key.trim())); assertEquals(value, storedProperties.getProperty(key.trim())); } - + // ----------------------------------- PRIVATE ------------------------------------------- - + private File getPropertiesFileLocation(File testFolder) { return new File(testFolder, "test.properties"); } - - private Properties getStoredProperties(File testFolder) throws FileNotFoundException, IOException { + + private Properties getStoredProperties(File testFolder) + throws FileNotFoundException, IOException { Properties properties = new Properties(); properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); return properties; diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java index 255104eda..a95b1267f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java @@ -3,47 +3,45 @@ package eu.dnetlib.collector.worker.model; import java.util.HashMap; import java.util.Map; - public class ApiDescriptor { - private String id; + private String id; - private String baseUrl; 
+ private String baseUrl; - private String protocol; + private String protocol; - private Map params = new HashMap<>(); + private Map params = new HashMap<>(); - public String getBaseUrl() { - return baseUrl; - } + public String getBaseUrl() { + return baseUrl; + } - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } + public void setBaseUrl(final String baseUrl) { + this.baseUrl = baseUrl; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public Map getParams() { - return params; - } + public Map getParams() { + return params; + } - public void setParams(final HashMap params) { - this.params = params; - } + public void setParams(final HashMap params) { + this.params = params; + } - public String getProtocol() { - return protocol; - } - - public void setProtocol(final String protocol) { - this.protocol = protocol; - } + public String getProtocol() { + return protocol; + } + public void setProtocol(final String protocol) { + this.protocol = protocol; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index 478bda440..1284e9ab6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -2,7 +2,6 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.UUID; - import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -12,108 +11,107 @@ import javax.persistence.Table; @Table(name = "mdstores") public class MDStore implements Serializable { - /** - * - */ - private static final long serialVersionUID = 3160530489149700055L; + /** */ + private 
static final long serialVersionUID = 3160530489149700055L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - } + 
public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public static MDStore newInstance(final String format, final String layout, final String interpretation) { - return newInstance(format, layout, interpretation, null, null, null); - } - - public static MDStore newInstance(final String format, - final String layout, - final String interpretation, - final String dsName, - final String dsId, - final String apiId) { - final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); - md.setFormat(format); - md.setLayout(layout); - md.setInterpretation(interpretation); - md.setDatasourceName(dsName); - md.setDatasourceId(dsId); - md.setApiId(apiId); - return md; - } + public static MDStore newInstance( + final String format, final String layout, final String interpretation) { + return newInstance(format, layout, interpretation, null, null, null); + } + public static MDStore newInstance( + final String format, + final String layout, + final String interpretation, + final String dsName, + final String dsId, + final String apiId) { + final MDStore md = new MDStore(); + md.setId("md-" + UUID.randomUUID()); + md.setFormat(format); + md.setLayout(layout); + 
md.setInterpretation(interpretation); + md.setDatasourceName(dsName); + md.setDatasourceId(dsId); + md.setApiId(apiId); + return md; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index 2a52d0d1d..45f6c8ee1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -1,7 +1,6 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; - import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -11,42 +10,40 @@ import javax.persistence.Table; @Table(name = "mdstore_current_versions") public class MDStoreCurrentVersion implements Serializable { - /** - * - */ - private static final long serialVersionUID = -4757725888593745773L; + /** */ + private static final long serialVersionUID = -4757725888593745773L; - @Id - @Column(name = "mdstore") - private String mdstore; + @Id + @Column(name = "mdstore") + private String mdstore; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public String getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public static 
MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { - final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); - cv.setMdstore(mdId); - cv.setCurrentVersion(versionId); - return cv; - } + public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { + final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); + cv.setMdstore(mdId); + cv.setCurrentVersion(versionId); + return cv; + } - public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { - return newInstance(v.getMdstore(), v.getId()); - } + public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { + return newInstance(v.getMdstore(), v.getId()); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index 4ad6f137c..b53d5f118 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -2,7 +2,6 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; - import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -14,88 +13,85 @@ import javax.persistence.TemporalType; @Table(name = "mdstore_versions") public class MDStoreVersion implements Serializable { - /** - * - */ - private static final long serialVersionUID = -4763494442274298339L; + /** */ + private static final long serialVersionUID = -4763494442274298339L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "mdstore") - private String mdstore; + @Column(name = "mdstore") + private String mdstore; - @Column(name = "writing") - private boolean writing; + @Column(name = "writing") + private boolean writing; - @Column(name = 
"readcount") - private int readCount = 0; + @Column(name = "readcount") + private int readCount = 0; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; - } + public static MDStoreVersion newInstance(final String mdId, final boolean writing) { + final MDStoreVersion t = new MDStoreVersion(); + t.setId(mdId + "-" + new Date().getTime()); + t.setMdstore(mdId); + t.setLastUpdate(null); + t.setWriting(writing); + t.setReadCount(0); + t.setSize(0); + return t; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public boolean isWriting() { - return writing; - } + public boolean isWriting() { + return writing; + } - public void setWriting(final boolean writing) { - this.writing = writing; - } + public void setWriting(final boolean writing) { + this.writing = writing; + } - public int getReadCount() { - return readCount; - } + public int getReadCount() { + return readCount; + } - public void setReadCount(final int readCount) { - this.readCount = readCount; - } + public void setReadCount(final int readCount) { + this.readCount = 
readCount; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } - - public void setSize(final long size) { - this.size = size; - } + public long getSize() { + return size; + } + public void setSize(final long size) { + this.size = size; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index dfe188f54..27c77a5e7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -2,7 +2,6 @@ package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; - import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -14,132 +13,129 @@ import javax.persistence.TemporalType; @Table(name = "mdstores_with_info") public class MDStoreWithInfo implements Serializable { - /** - * - */ - private static final long serialVersionUID = -8445784770687571492L; + /** */ + private static final long serialVersionUID = -8445784770687571492L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String 
datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - @Column(name = "n_versions") - private long numberOfVersions = 0; + @Column(name = "n_versions") + private long numberOfVersions = 0; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - 
} + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public String getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } + public long getSize() { + return size; + } - public void setSize(final long size) { - this.size = size; - } + public void setSize(final long size) { + this.size = size; + } - public long getNumberOfVersions() { - return numberOfVersions; - } - - public void setNumberOfVersions(final long numberOfVersions) { - this.numberOfVersions = numberOfVersions; - } + public long getNumberOfVersions() { + return numberOfVersions; + } + public void setNumberOfVersions(final 
long numberOfVersions) { + this.numberOfVersions = numberOfVersions; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index cbfc5caf1..f3a2273cb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,10 +1,6 @@ package eu.dnetlib.dhp.application; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.cli.*; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Serializable; @@ -12,7 +8,9 @@ import java.io.StringWriter; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import java.util.zip.Inflater; +import org.apache.commons.cli.*; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.io.IOUtils; public class ArgumentApplicationParser implements Serializable { @@ -23,7 +21,8 @@ public class ArgumentApplicationParser implements Serializable { public ArgumentApplicationParser(final String json_configuration) throws Exception { final ObjectMapper mapper = new ObjectMapper(); - final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); + final OptionsParameter[] configuration = + mapper.readValue(json_configuration, OptionsParameter[].class); createOptionMap(configuration); } @@ -33,23 +32,26 @@ public class ArgumentApplicationParser implements Serializable { private void createOptionMap(final OptionsParameter[] configuration) { - Arrays.stream(configuration).map(conf -> { - final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); - o.setLongOpt(conf.getParamLongName()); - o.setRequired(conf.isParamRequired()); - 
if (conf.isCompressed()) { - compressedValues.add(conf.getParamLongName()); - } - return o; - }).forEach(options::addOption); - -// HelpFormatter formatter = new HelpFormatter(); -// formatter.printHelp("myapp", null, options, null, true); + Arrays.stream(configuration) + .map( + conf -> { + final Option o = + new Option( + conf.getParamName(), true, conf.getParamDescription()); + o.setLongOpt(conf.getParamLongName()); + o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } + return o; + }) + .forEach(options::addOption); + // HelpFormatter formatter = new HelpFormatter(); + // formatter.printHelp("myapp", null, options, null, true); } - public static String decompressValue(final String abstractCompressed) { try { byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); @@ -63,7 +65,7 @@ public class ArgumentApplicationParser implements Serializable { } } - public static String compressArgument(final String value) throws Exception{ + public static String compressArgument(final String value) throws Exception { ByteArrayOutputStream out = new ByteArrayOutputStream(); GZIPOutputStream gzip = new GZIPOutputStream(out); gzip.write(value.getBytes()); @@ -74,7 +76,14 @@ public class ArgumentApplicationParser implements Serializable { public void parseArgument(final String[] args) throws Exception { CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); - Arrays.stream(cmd.getOptions()).forEach(it -> objectMap.put(it.getLongOpt(), compressedValues.contains(it.getLongOpt())? decompressValue(it.getValue()): it.getValue())); + Arrays.stream(cmd.getOptions()) + .forEach( + it -> + objectMap.put( + it.getLongOpt(), + compressedValues.contains(it.getLongOpt()) + ? 
decompressValue(it.getValue()) + : it.getValue())); } public String get(final String key) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java index 4e7c2826b..dcc537656 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.application; - public class OptionsParameter { private String paramName; @@ -9,8 +8,7 @@ public class OptionsParameter { private boolean paramRequired; private boolean compressed; - public OptionsParameter() { - } + public OptionsParameter() {} public String getParamName() { return paramName; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java index d78520f55..17a039878 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java @@ -3,23 +3,19 @@ package eu.dnetlib.dhp.common; import java.io.Serializable; import java.util.function.Supplier; -/** - * Provides serializable and throwing extensions to standard functional interfaces. - */ +/** Provides serializable and throwing extensions to standard functional interfaces. */ public class FunctionalInterfaceSupport { - private FunctionalInterfaceSupport() { - } + private FunctionalInterfaceSupport() {} /** - * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying - * functions externally. + * Serializable supplier of any kind of objects. To be used withing spark processing pipelines + * when supplying functions externally. 
* * @param */ @FunctionalInterface - public interface SerializableSupplier extends Supplier, Serializable { - } + public interface SerializableSupplier extends Supplier, Serializable {} /** * Extension of consumer accepting functions throwing an exception. @@ -52,5 +48,4 @@ public class FunctionalInterfaceSupport { public interface ThrowingRunnable { void run() throws E; } - -} \ No newline at end of file +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java index f6b94c921..48d6d79f6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -1,5 +1,10 @@ package eu.dnetlib.dhp.common; +import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -7,66 +12,60 @@ import org.apache.hadoop.fs.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; - -/** - * HDFS utility methods. - */ +/** HDFS utility methods. */ public class HdfsSupport { private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); - private HdfsSupport() { - } + private HdfsSupport() {} /** * Checks a path (file or dir) exists on HDFS. 
* - * @param path Path to be checked + * @param path Path to be checked * @param configuration Configuration of hadoop env */ public static boolean exists(String path, Configuration configuration) { logger.info("Removing path: {}", path); - return rethrowAsRuntimeException(() -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - return fileSystem.exists(f); - }); + return rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + return fileSystem.exists(f); + }); } /** * Removes a path (file or dir) from HDFS. * - * @param path Path to be removed + * @param path Path to be removed * @param configuration Configuration of hadoop env */ public static void remove(String path, Configuration configuration) { logger.info("Removing path: {}", path); - rethrowAsRuntimeException(() -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - if (fileSystem.exists(f)) { - fileSystem.delete(f, true); - } - }); + rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + if (fileSystem.exists(f)) { + fileSystem.delete(f, true); + } + }); } /** * Lists hadoop files located below path or alternatively lists subdirs under path. 
* - * @param path Path to be listed for hadoop files + * @param path Path to be listed for hadoop files * @param configuration Configuration of hadoop env * @return List with string locations of hadoop files */ public static List listFiles(String path, Configuration configuration) { logger.info("Listing files in path: {}", path); - return rethrowAsRuntimeException(() -> Arrays - .stream(FileSystem.get(configuration).listStatus(new Path(path))) - .filter(FileStatus::isDirectory) - .map(x -> x.getPath().toString()) - .collect(Collectors.toList())); + return rethrowAsRuntimeException( + () -> + Arrays.stream(FileSystem.get(configuration).listStatus(new Path(path))) + .filter(FileStatus::isDirectory) + .map(x -> x.getPath().toString()) + .collect(Collectors.toList())); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java index 43c18a956..12e98a056 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java @@ -1,61 +1,71 @@ package eu.dnetlib.dhp.common; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; +import java.util.Objects; +import java.util.function.Function; import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; -import java.util.Objects; -import java.util.function.Function; - -/** - * SparkSession utility methods. - */ +/** SparkSession utility methods. */ public class SparkSessionSupport { - private SparkSessionSupport() { + private SparkSessionSupport() {} + + /** + * Runs a given function using SparkSession created using default builder and supplied + * SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession + * created externally. 
+ * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession( + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).getOrCreate(), + conf, + isSparkSessionManaged, + fn); } /** - * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession - * when SparkSession is managed. Allows to reuse SparkSession created externally. + * Runs a given function using SparkSession created with hive support and using default builder + * and supplied SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse + * SparkSession created externally. * - * @param conf SparkConf instance + * @param conf SparkConf instance * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession + * @param fn Consumer to be applied to constructed SparkSession */ - public static void runWithSparkSession(SparkConf conf, - Boolean isSparkSessionManaged, - ThrowingConsumer fn) { - runWithSparkSession(c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); + public static void runWithSparkHiveSession( + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), + conf, + isSparkSessionManaged, + fn); } /** - * Runs a given function using SparkSession created with hive support and using default builder and supplied SparkConf. - * Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. + * Runs a given function using SparkSession created using supplied builder and supplied + * SparkConf. Stops SparkSession when SparkSession is managed. 
Allows to reuse SparkSession + * created externally. * - * @param conf SparkConf instance + * @param sparkSessionBuilder Builder of SparkSession + * @param conf SparkConf instance * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession + * @param fn Consumer to be applied to constructed SparkSession */ - public static void runWithSparkHiveSession(SparkConf conf, - Boolean isSparkSessionManaged, - ThrowingConsumer fn) { - runWithSparkSession(c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), conf, isSparkSessionManaged, fn); - } - - /** - * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops SparkSession - * when SparkSession is managed. Allows to reuse SparkSession created externally. - * - * @param sparkSessionBuilder Builder of SparkSession - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession(Function sparkSessionBuilder, - SparkConf conf, - Boolean isSparkSessionManaged, - ThrowingConsumer fn) { + public static void runWithSparkSession( + Function sparkSessionBuilder, + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { SparkSession spark = null; try { spark = sparkSessionBuilder.apply(conf); @@ -68,4 +78,4 @@ public class SparkSessionSupport { } } } -} \ No newline at end of file +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java index b32803c37..5bc66f493 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java @@ -3,18 +3,15 @@ package eu.dnetlib.dhp.common; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingRunnable; import 
eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingSupplier; -/** - * Exception handling utility methods. - */ +/** Exception handling utility methods. */ public class ThrowingSupport { - private ThrowingSupport() { - } + private ThrowingSupport() {} /** * Executes given runnable and rethrows any exceptions as RuntimeException. * - * @param fn Runnable to be executed + * @param fn Runnable to be executed * @param Type of exception thrown */ public static void rethrowAsRuntimeException(ThrowingRunnable fn) { @@ -28,11 +25,12 @@ public class ThrowingSupport { /** * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. * - * @param fn Runnable to be executed + * @param fn Runnable to be executed * @param msg Message to be set for rethrown exception * @param Type of exception thrown */ - public static void rethrowAsRuntimeException(ThrowingRunnable fn, String msg) { + public static void rethrowAsRuntimeException( + ThrowingRunnable fn, String msg) { try { fn.run(); } catch (Exception e) { @@ -43,7 +41,7 @@ public class ThrowingSupport { /** * Executes given supplier and rethrows any exceptions as RuntimeException. * - * @param fn Supplier to be executed + * @param fn Supplier to be executed * @param Type of returned value * @param Type of exception thrown * @return Result of supplier execution @@ -59,18 +57,18 @@ public class ThrowingSupport { /** * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. 
* - * @param fn Supplier to be executed + * @param fn Supplier to be executed * @param msg Message to be set for rethrown exception * @param Type of returned value * @param Type of exception thrown * @return Result of supplier execution */ - public static T rethrowAsRuntimeException(ThrowingSupplier fn, String msg) { + public static T rethrowAsRuntimeException( + ThrowingSupplier fn, String msg) { try { return fn.get(); } catch (Exception e) { throw new RuntimeException(msg, e); } } - -} \ No newline at end of file +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java index acbb7ffbb..98ee80bd6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java @@ -1,66 +1,52 @@ package eu.dnetlib.dhp.model.mdstore; import eu.dnetlib.dhp.utils.DHPUtils; - import java.io.Serializable; - -/** - * This class models a record inside the new Metadata store collection on HDFS * - * - */ +/** This class models a record inside the new Metadata store collection on HDFS * */ public class MetadataRecord implements Serializable { - /** - * The D-Net Identifier associated to the record - */ + /** The D-Net Identifier associated to the record */ private String id; - /** - * The original Identifier of the record - */ + /** The original Identifier of the record */ private String originalId; - - /** - * The encoding of the record, should be JSON or XML - */ + /** The encoding of the record, should be JSON or XML */ private String encoding; /** - * The information about the provenance of the record see @{@link Provenance} - * for the model of this information + * The information about the provenance of the record see @{@link Provenance} for the model of + * this information */ private Provenance provenance; - /** - * The content of the metadata - */ + /** The content of the 
metadata */ private String body; - /** - * the date when the record has been stored - */ + /** the date when the record has been stored */ private long dateOfCollection; - /** - * the date when the record has been stored - */ + /** the date when the record has been stored */ private long dateOfTransformation; - public MetadataRecord() { this.dateOfCollection = System.currentTimeMillis(); } - public MetadataRecord(String originalId, String encoding, Provenance provenance, String body, long dateOfCollection) { + public MetadataRecord( + String originalId, + String encoding, + Provenance provenance, + String body, + long dateOfCollection) { this.originalId = originalId; this.encoding = encoding; this.provenance = provenance; this.body = body; this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId,this.provenance.getNsPrefix()); + this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); } public String getId() { @@ -71,7 +57,6 @@ public class MetadataRecord implements Serializable { this.id = id; } - public String getOriginalId() { return originalId; } @@ -96,7 +81,6 @@ public class MetadataRecord implements Serializable { this.provenance = provenance; } - public String getBody() { return body; } @@ -127,7 +111,6 @@ public class MetadataRecord implements Serializable { return false; } return ((MetadataRecord) o).getId().equalsIgnoreCase(id); - } @Override diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java index de67281f2..dbbeb9276 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java @@ -2,27 +2,20 @@ package eu.dnetlib.dhp.model.mdstore; import java.io.Serializable; - /** * @author Sandro La Bruzzo - * - * Provenace class models the provenance of the record in the metadataStore - * It contains the 
identifier and the name of the datasource that gives the - * record - * + *

Provenace class models the provenance of the record in the metadataStore It contains the + * identifier and the name of the datasource that gives the record */ public class Provenance implements Serializable { private String datasourceId; - private String datasourceName; private String nsPrefix; - public Provenance() { - - } + public Provenance() {} public Provenance(String datasourceId, String datasourceName, String nsPrefix) { this.datasourceId = datasourceId; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java index 77b28f207..aa801acd5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java @@ -9,4 +9,4 @@ public class VtdException extends Exception { public VtdException(final Throwable e) { super(e); } -} \ No newline at end of file +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java index 5d92e1c5f..c7a86957a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java @@ -1,20 +1,17 @@ package eu.dnetlib.dhp.parser.utility; +import com.ximpleware.AutoPilot; +import com.ximpleware.VTDNav; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; - -import com.ximpleware.AutoPilot; -import com.ximpleware.VTDNav; - -/** - * Created by sandro on 9/29/16. - */ +/** Created by sandro on 9/29/16. 
*/ public class VtdUtilityParser { - public static List getTextValuesWithAttributes(final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) + public static List getTextValuesWithAttributes( + final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) throws VtdException { final List results = new ArrayList<>(); try { @@ -35,25 +32,28 @@ public class VtdUtilityParser { } } - private static Map getAttributes(final VTDNav vn, final List attributes) { + private static Map getAttributes( + final VTDNav vn, final List attributes) { final Map currentAttributes = new HashMap<>(); if (attributes != null) { - attributes.forEach(attributeKey -> { - try { - int attr = vn.getAttrVal(attributeKey); - if (attr > -1) { - currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); + attributes.forEach( + attributeKey -> { + try { + int attr = vn.getAttrVal(attributeKey); + if (attr > -1) { + currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); } return currentAttributes; } - public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws VtdException { + public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) + throws VtdException { List results = new ArrayList<>(); try { ap.selectXPath(xpath); @@ -67,13 +67,13 @@ public class VtdUtilityParser { } } - public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) throws VtdException { + public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) + throws VtdException { try { ap.selectXPath(xpath); while (ap.evalXPath() != -1) { int it = nav.getText(); - if (it > -1) - return nav.toNormalizedString(it); + if (it > -1) return nav.toNormalizedString(it); } return null; } catch (Exception e) { @@ 
-103,5 +103,4 @@ public class VtdUtilityParser { this.attributes = attributes; } } - } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index ea8943efd..b2b86d8f2 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,18 +1,16 @@ package eu.dnetlib.dhp.utils; import com.jayway.jsonpath.JsonPath; -import net.minidev.json.JSONArray; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.codec.binary.Base64OutputStream; -import org.apache.commons.codec.binary.Hex; -import org.apache.commons.lang3.StringUtils; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; +import net.minidev.json.JSONArray; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.binary.Base64OutputStream; +import org.apache.commons.codec.binary.Hex; public class DHPUtils { @@ -28,41 +26,40 @@ public class DHPUtils { } public static String generateIdentifier(final String originalId, final String nsPrefix) { - return String.format("%s::%s",nsPrefix, DHPUtils.md5(originalId)); + return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); } - public static String compressString(final String input ) { - try ( ByteArrayOutputStream out = new ByteArrayOutputStream(); Base64OutputStream b64os = new Base64OutputStream(out)) { + public static String compressString(final String input) { + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + Base64OutputStream b64os = new Base64OutputStream(out)) { GZIPOutputStream gzip = new GZIPOutputStream(b64os); gzip.write(input.getBytes(StandardCharsets.UTF_8)); gzip.close(); return out.toString(); - } catch (Throwable e ) { + } catch (Throwable e) { return 
null; } } - public static String decompressString(final String input) { byte[] byteArray = Base64.decodeBase64(input.getBytes()); int len; - try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { + try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); + ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { byte[] buffer = new byte[1024]; - while((len = gis.read(buffer)) != -1){ + while ((len = gis.read(buffer)) != -1) { bos.write(buffer, 0, len); } return bos.toString(); } catch (Exception e) { return null; } - } public static String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; + if (o instanceof String) return (String) o; if (o instanceof JSONArray && ((JSONArray) o).size() > 0) return (String) ((JSONArray) o).get(0); return o.toString(); @@ -70,5 +67,4 @@ public class DHPUtils { return ""; } } - } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java index bd3962440..494a73a75 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -9,10 +9,13 @@ import net.sf.saxon.trans.XPathException; public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { - public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; + public static String DEFAULT_SAXON_EXT_NS_URI = + "http://www.d-net.research-infrastructures.eu/saxon-extension"; public abstract String getName(); - public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + + public 
abstract Sequence doCall(XPathContext context, Sequence[] arguments) + throws XPathException; @Override public StructuredQName getFunctionQName() { @@ -28,5 +31,4 @@ public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinit } }; } - -} \ No newline at end of file +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java index f90e2a23e..c027074bf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -1,5 +1,9 @@ package eu.dnetlib.dhp.utils.saxon; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.GregorianCalendar; import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; @@ -7,14 +11,9 @@ import net.sf.saxon.trans.XPathException; import net.sf.saxon.value.SequenceType; import net.sf.saxon.value.StringValue; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Calendar; -import java.util.GregorianCalendar; - public class ExtractYear extends AbstractExtensionFunction { - private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" }; + private static final String[] dateFormats = {"yyyy-MM-dd", "yyyy/MM/dd"}; @Override public String getName() { @@ -45,7 +44,7 @@ public class ExtractYear extends AbstractExtensionFunction { @Override public SequenceType[] getArgumentTypes() { - return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; } @Override @@ -60,7 +59,8 @@ public class ExtractYear extends AbstractExtensionFunction { c.setTime(new SimpleDateFormat(format).parse(s)); String year = String.valueOf(c.get(Calendar.YEAR)); return year; - } catch (ParseException e) {} + } catch (ParseException e) { + } } return ""; } diff --git 
a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index 634e08788..7465c9735 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -1,18 +1,19 @@ package eu.dnetlib.dhp.utils.saxon; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.Date; import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; import net.sf.saxon.value.SequenceType; import net.sf.saxon.value.StringValue; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Date; - public class NormalizeDate extends AbstractExtensionFunction { - private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" }; + private static final String[] normalizeDateFormats = { + "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" + }; private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); @@ -42,7 +43,7 @@ public class NormalizeDate extends AbstractExtensionFunction { @Override public SequenceType[] getArgumentTypes() { - return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; } @Override @@ -58,9 +59,9 @@ public class NormalizeDate extends AbstractExtensionFunction { Date parse = new SimpleDateFormat(format).parse(date); String res = new SimpleDateFormat(normalizeOutFormat).format(parse); return res; - } catch (ParseException e) {} + } catch (ParseException e) { + } } return ""; } - } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java index a221e37c6..16ccaaa17 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -24,7 +24,8 @@ public class PickFirst extends AbstractExtensionFunction { final String s1 = getValue(arguments[0]); final String s2 = getValue(arguments[1]); - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + return new StringValue( + StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); } private String getValue(final Sequence arg) throws XPathException { @@ -49,12 +50,11 @@ public class PickFirst extends AbstractExtensionFunction { @Override public SequenceType[] getArgumentTypes() { - return new SequenceType[] { SequenceType.OPTIONAL_ITEM }; + return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; } @Override public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { return SequenceType.SINGLE_STRING; } - } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java index 611709ff0..0ee1cfb60 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -1,17 +1,17 @@ package eu.dnetlib.dhp.utils.saxon; -import net.sf.saxon.Configuration; -import net.sf.saxon.TransformerFactoryImpl; - +import java.io.StringReader; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamSource; -import java.io.StringReader; +import net.sf.saxon.Configuration; +import net.sf.saxon.TransformerFactoryImpl; public class SaxonTransformerFactory { /** * Creates the index record transformer from the given XSLT + * * @param xslt * @return * @throws TransformerException @@ -26,5 +26,4 @@ public class SaxonTransformerFactory { return factory.newTransformer(new StreamSource(new StringReader(xslt))); } - } diff --git 
a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java index 3767bd026..6accff806 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java @@ -2,7 +2,6 @@ package eu.dnetlib.message; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; - import java.io.IOException; import java.util.Map; @@ -16,20 +15,12 @@ public class Message { private Map body; - public static Message fromJson(final String json) throws IOException { final ObjectMapper jsonMapper = new ObjectMapper(); return jsonMapper.readValue(json, Message.class); - - } - - public Message() { - - - - } + public Message() {} public Message(String workflowId, String jobName, MessageType type, Map body) { this.workflowId = workflowId; diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java index af5339034..a2f3eff3e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java @@ -4,7 +4,6 @@ import com.rabbitmq.client.AMQP; import com.rabbitmq.client.Channel; import com.rabbitmq.client.DefaultConsumer; import com.rabbitmq.client.Envelope; - import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.concurrent.LinkedBlockingQueue; @@ -13,7 +12,6 @@ public class MessageConsumer extends DefaultConsumer { final LinkedBlockingQueue queueMessages; - /** * Constructs a new instance and records its association to the passed-in channel. 
* @@ -25,19 +23,20 @@ public class MessageConsumer extends DefaultConsumer { this.queueMessages = queueMessages; } - @Override - public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) throws IOException { + public void handleDelivery( + String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) + throws IOException { final String json = new String(body, StandardCharsets.UTF_8); Message message = Message.fromJson(json); try { this.queueMessages.put(message); - System.out.println("Receiving Message "+message); + System.out.println("Receiving Message " + message); } catch (InterruptedException e) { - if (message.getType()== MessageType.REPORT) + if (message.getType() == MessageType.REPORT) throw new RuntimeException("Error on sending message"); else { - //TODO LOGGING EXCEPTION + // TODO LOGGING EXCEPTION } } finally { getChannel().basicAck(envelope.getDeliveryTag(), false); diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java index e3d90f7e0..d226e7662 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java @@ -3,8 +3,6 @@ package eu.dnetlib.message; import com.rabbitmq.client.Channel; import com.rabbitmq.client.Connection; import com.rabbitmq.client.ConnectionFactory; -import sun.rmi.runtime.Log; - import java.io.IOException; import java.util.HashMap; import java.util.Map; @@ -21,23 +19,32 @@ public class MessageManager { private Connection connection; - private Map channels = new HashMap<>(); + private Map channels = new HashMap<>(); - private boolean durable; + private boolean durable; - private boolean autodelete; + private boolean autodelete; - final private LinkedBlockingQueue queueMessages; + private final LinkedBlockingQueue queueMessages; - public MessageManager(String messageHost, String username, 
String password, final LinkedBlockingQueue queueMessages) { + public MessageManager( + String messageHost, + String username, + String password, + final LinkedBlockingQueue queueMessages) { this.queueMessages = queueMessages; this.messageHost = messageHost; this.username = username; this.password = password; } - - public MessageManager(String messageHost, String username, String password, boolean durable, boolean autodelete, final LinkedBlockingQueue queueMessages) { + public MessageManager( + String messageHost, + String username, + String password, + boolean durable, + boolean autodelete, + final LinkedBlockingQueue queueMessages) { this.queueMessages = queueMessages; this.messageHost = messageHost; this.username = username; @@ -55,7 +62,12 @@ public class MessageManager { return factory.newConnection(); } - private Channel createChannel(final Connection connection, final String queueName, final boolean durable, final boolean autodelete ) throws Exception { + private Channel createChannel( + final Connection connection, + final String queueName, + final boolean durable, + final boolean autodelete) + throws Exception { Map args = new HashMap<>(); args.put("x-message-ttl", 10000); Channel channel = connection.createChannel(); @@ -63,9 +75,10 @@ public class MessageManager { return channel; } - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) throws Exception { + private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) + throws Exception { if (channels.containsKey(queueName)) { - return channels.get(queueName); + return channels.get(queueName); } if (this.connection == null) { @@ -75,16 +88,16 @@ public class MessageManager { return channels.get(queueName); } - - public void close() throws IOException { - channels.values().forEach(ch-> { - try { - ch.close(); - } catch (Exception e) { - //TODO LOG - } - }); + channels.values() + .forEach( + ch -> { + try { + ch.close(); + } catch 
(Exception e) { + // TODO LOG + } + }); this.connection.close(); } @@ -92,26 +105,30 @@ public class MessageManager { public boolean sendMessage(final Message message, String queueName) throws Exception { try { Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName,null, message.toString().getBytes()); + channel.basicPublish("", queueName, null, message.toString().getBytes()); return true; } catch (Throwable e) { throw new RuntimeException(e); } } - public boolean sendMessage(final Message message, String queueName, boolean durable_var, boolean autodelete_var) throws Exception { + public boolean sendMessage( + final Message message, String queueName, boolean durable_var, boolean autodelete_var) + throws Exception { try { Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName,null, message.toString().getBytes()); + channel.basicPublish("", queueName, null, message.toString().getBytes()); return true; } catch (Throwable e) { throw new RuntimeException(e); } } - public void startConsumingMessage(final String queueName, final boolean durable, final boolean autodelete) throws Exception{ + public void startConsumingMessage( + final String queueName, final boolean durable, final boolean autodelete) + throws Exception { Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel,queueMessages)); + channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java index c2440c3fe..37db1ec4f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java @@ -1,8 +1,6 @@ package eu.dnetlib.message; public enum MessageType { - 
ONGOING, REPORT - } diff --git a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java index ff88cda4c..0d6cd33c1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java @@ -2,7 +2,7 @@ package eu.dnetlib.scholexplorer.relation; import java.io.Serializable; -public class RelInfo implements Serializable { +public class RelInfo implements Serializable { private String original; private String inverse; diff --git a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java index 647c11789..5d5659ea6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java @@ -1,19 +1,18 @@ package eu.dnetlib.scholexplorer.relation; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.io.IOUtils; - import java.io.Serializable; import java.util.HashMap; +import org.apache.commons.io.IOUtils; -public class RelationMapper extends HashMap implements Serializable { +public class RelationMapper extends HashMap implements Serializable { public static RelationMapper load() throws Exception { - final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); + final String json = + IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); ObjectMapper mapper = new ObjectMapper(); return mapper.readValue(json, RelationMapper.class); } - } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java index f4598ebd4..81f86657d 100644 --- 
a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -1,30 +1,46 @@ package eu.dnetlib.dhp.application; -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + public class ArgumentApplicationParserTest { @Test public void testParseParameter() throws Exception { - final String jsonConfiguration = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); + final String jsonConfiguration = + IOUtils.toString( + this.getClass() + .getResourceAsStream("/eu/dnetlib/application/parameters.json")); assertNotNull(jsonConfiguration); ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(new String[]{"-p", "value0", - "-a", "value1", - "-n", "value2", - "-u", "value3", - "-ru", "value4", - "-rp", "value5", - "-rh", "value6", - "-ro", "value7", - "-rr", "value8", - "-w", "value9", - "-cc", ArgumentApplicationParser.compressArgument(jsonConfiguration) - }); + parser.parseArgument( + new String[] { + "-p", + "value0", + "-a", + "value1", + "-n", + "value2", + "-u", + "value3", + "-ru", + "value4", + "-rp", + "value5", + "-rh", + "value6", + "-ro", + "value7", + "-rr", + "value8", + "-w", + "value9", + "-cc", + ArgumentApplicationParser.compressArgument(jsonConfiguration) + }); assertNotNull(parser.get("hdfsPath")); assertNotNull(parser.get("apidescriptor")); assertNotNull(parser.get("namenode")); @@ -47,10 +63,4 @@ public class ArgumentApplicationParserTest { assertEquals("value9", parser.get("workflowId")); assertEquals(jsonConfiguration, parser.get("ccCoco")); } - - - - - - } diff --git 
a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java index f1e790ee7..dac48b22d 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java @@ -1,9 +1,6 @@ package eu.dnetlib.dhp.common; -import org.apache.hadoop.conf.Configuration; -import org.junit.jupiter.api.Nested; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; @@ -11,8 +8,10 @@ import java.nio.file.Path; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.*; +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; public class HdfsSupportTest { @@ -22,8 +21,8 @@ public class HdfsSupportTest { @Test public void shouldThrowARuntimeExceptionOnError() { // when - assertThrows(RuntimeException.class, () -> - HdfsSupport.remove(null, new Configuration())); + assertThrows( + RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); } @Test @@ -54,8 +53,8 @@ public class HdfsSupportTest { @Test public void shouldThrowARuntimeExceptionOnError() { // when - assertThrows(RuntimeException.class, () -> - HdfsSupport.listFiles(null, new Configuration())); + assertThrows( + RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); } @Test @@ -68,11 +67,13 @@ public class HdfsSupportTest { // then assertEquals(2, paths.size()); - List expecteds = Arrays.stream(new String[]{subDir1.toString(), subDir2.toString()}) - .sorted().collect(Collectors.toList()); + List expecteds = + Arrays.stream(new String[] {subDir1.toString(), subDir2.toString()}) + .sorted() + 
.collect(Collectors.toList()); List actuals = paths.stream().sorted().collect(Collectors.toList()); assertTrue(actuals.get(0).contains(expecteds.get(0))); assertTrue(actuals.get(1).contains(expecteds.get(1))); } } -} \ No newline at end of file +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java index bc2dce3cf..17424c38a 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java @@ -1,22 +1,22 @@ package eu.dnetlib.dhp.common; +import static org.mockito.Mockito.*; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; +import java.util.function.Function; import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import java.util.function.Function; - -import static org.mockito.Mockito.*; - public class SparkSessionSupportTest { @Nested class RunWithSparkSession { @Test - public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() throws Exception { + public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() + throws Exception { // given SparkSession spark = mock(SparkSession.class); SparkConf conf = mock(SparkConf.class); @@ -34,7 +34,8 @@ public class SparkSessionSupportTest { } @Test - public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() throws Exception { + public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() + throws Exception { // given SparkSession spark = mock(SparkSession.class); SparkConf conf = mock(SparkConf.class); @@ -51,4 +52,4 @@ public class SparkSessionSupportTest { verify(spark, times(1)).stop(); } } -} \ No newline at end of file +} diff --git 
a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java index a2bac54ba..0750ad03a 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java @@ -1,9 +1,9 @@ package eu.dnetlib.dhp.model.mdstore; -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.Test; + public class MetadataRecordTest { @Test @@ -12,4 +12,4 @@ public class MetadataRecordTest { MetadataRecord r = new MetadataRecord(); assertTrue(r.getDateOfCollection() > 0); } -} \ No newline at end of file +} diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java index 73df63b32..9e57c286c 100644 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java @@ -1,12 +1,11 @@ package eu.dnetlib.message; -import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.util.HashMap; import java.util.Map; - -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.Test; public class MessageTest { @@ -16,7 +15,7 @@ public class MessageTest { m.setWorkflowId("wId"); m.setType(MessageType.ONGOING); m.setJobName("Collection"); - Map body= new HashMap<>(); + Map body = new HashMap<>(); body.put("parsedItem", "300"); body.put("ExecutionTime", "30s"); @@ -28,28 +27,26 @@ public class MessageTest { assertEquals(m1.getJobName(), m.getJobName()); assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); + m1.getBody() + .keySet() + .forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); assertEquals(m1.getJobName(), 
m.getJobName()); } @Test public void toStringTest() { - final String expectedJson= "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; + final String expectedJson = + "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; Message m = new Message(); m.setWorkflowId("wId"); m.setType(MessageType.ONGOING); m.setJobName("Collection"); - Map body= new HashMap<>(); + Map body = new HashMap<>(); body.put("parsedItem", "300"); body.put("ExecutionTime", "30s"); m.setBody(body); - assertEquals(expectedJson,m.toString()); - - + assertEquals(expectedJson, m.toString()); } - - - -} \ No newline at end of file +} diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java index eb9fb172d..40a2aaecd 100644 --- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java @@ -2,14 +2,12 @@ package eu.dnetlib.scholexplorer.relation; import org.junit.jupiter.api.Test; - public class RelationMapperTest { @Test - public void testLoadRels() throws Exception{ + public void testLoadRels() throws Exception { RelationMapper relationMapper = RelationMapper.load(); relationMapper.keySet().forEach(System.out::println); - } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java index 0f9aa3adb..f7e5edc66 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.schema.action; import com.fasterxml.jackson.databind.annotation.JsonDeserialize; import 
eu.dnetlib.dhp.schema.oaf.Oaf; - import java.io.Serializable; @JsonDeserialize(using = AtomicActionDeserializer.class) @@ -12,8 +11,7 @@ public class AtomicAction implements Serializable { private T payload; - public AtomicAction() { - } + public AtomicAction() {} public AtomicAction(Class clazz, T payload) { this.clazz = clazz; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java index e6017288f..ad08e401c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -7,13 +7,13 @@ import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.Oaf; - import java.io.IOException; public class AtomicActionDeserializer extends JsonDeserializer { @Override - public Object deserialize(JsonParser jp, DeserializationContext ctxt) throws IOException, JsonProcessingException { + public Object deserialize(JsonParser jp, DeserializationContext ctxt) + throws IOException, JsonProcessingException { JsonNode node = jp.getCodec().readTree(jp); String classTag = node.get("clazz").asText(); JsonNode payload = node.get("payload"); diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java index 9e4fa7d8a..701c83fc2 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java @@ -2,15 +2,19 @@ package eu.dnetlib.dhp.schema.common; import eu.dnetlib.dhp.schema.oaf.OafEntity; -/** - * Actual entity types in the Graph - */ +/** Actual entity types in the Graph */ public enum EntityType { - - publication, 
dataset, otherresearchproduct, software, datasource, organization, project; + publication, + dataset, + otherresearchproduct, + software, + datasource, + organization, + project; /** * Resolves the EntityType, given the relative class name + * * @param clazz the given class name * @param actual OafEntity subclass * @return the EntityType associated to the given class @@ -19,5 +23,4 @@ public enum EntityType { return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); } - } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java index 7d65e39c0..c4abbfd00 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java @@ -1,9 +1,9 @@ package eu.dnetlib.dhp.schema.common; -/** - * Main entity types in the Graph - */ +/** Main entity types in the Graph */ public enum MainEntityType { - - result, datasource, organization, project -} \ No newline at end of file + result, + datasource, + organization, + project +} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index 48f711a03..bc317a6ff 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -2,33 +2,29 @@ package eu.dnetlib.dhp.schema.common; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; - import java.util.Map; -/** - * Oaf model utility methods. - */ +/** Oaf model utility methods. 
*/ public class ModelSupport { - /** - * Defines the mapping between the actual entity type and the main entity type - */ + /** Defines the mapping between the actual entity type and the main entity type */ private static Map entityMapping = Maps.newHashMap(); static { - entityMapping.put(EntityType.publication, MainEntityType.result); - entityMapping.put(EntityType.dataset, MainEntityType.result); - entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); - entityMapping.put(EntityType.software, MainEntityType.result); - entityMapping.put(EntityType.datasource, MainEntityType.datasource); - entityMapping.put(EntityType.organization, MainEntityType.organization); - entityMapping.put(EntityType.project, MainEntityType.project); + entityMapping.put(EntityType.publication, MainEntityType.result); + entityMapping.put(EntityType.dataset, MainEntityType.result); + entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); + entityMapping.put(EntityType.software, MainEntityType.result); + entityMapping.put(EntityType.datasource, MainEntityType.datasource); + entityMapping.put(EntityType.organization, MainEntityType.organization); + entityMapping.put(EntityType.project, MainEntityType.project); } /** - * Defines the mapping between the actual entity types and the relative classes implementing them + * Defines the mapping between the actual entity types and the relative classes implementing + * them */ - public final static Map entityTypes = Maps.newHashMap(); + public static final Map entityTypes = Maps.newHashMap(); static { entityTypes.put(EntityType.datasource, Datasource.class); @@ -40,7 +36,7 @@ public class ModelSupport { entityTypes.put(EntityType.publication, Publication.class); } - public final static Map oafTypes = Maps.newHashMap(); + public static final Map oafTypes = Maps.newHashMap(); static { oafTypes.put("datasource", Datasource.class); @@ -55,19 +51,19 @@ public class ModelSupport { private static final String 
schemeTemplate = "dnet:%s_%s_relations"; - private ModelSupport() { - } + private ModelSupport() {} /** * Checks subclass-superclass relationship. * - * @param subClazzObject Subclass object instance + * @param subClazzObject Subclass object instance * @param superClazzObject Superclass object instance - * @param Subclass type - * @param Superclass type + * @param Subclass type + * @param Superclass type * @return True if X is a subclass of Y */ - public static Boolean isSubClass(X subClazzObject, Y superClazzObject) { + public static Boolean isSubClass( + X subClazzObject, Y superClazzObject) { return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); } @@ -75,25 +71,27 @@ public class ModelSupport { * Checks subclass-superclass relationship. * * @param subClazzObject Subclass object instance - * @param superClazz Superclass class - * @param Subclass type - * @param Superclass type + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type * @return True if X is a subclass of Y */ - public static Boolean isSubClass(X subClazzObject, Class superClazz) { + public static Boolean isSubClass( + X subClazzObject, Class superClazz) { return isSubClass(subClazzObject.getClass(), superClazz); } /** * Checks subclass-superclass relationship. 
* - * @param subClazz Subclass class + * @param subClazz Subclass class * @param superClazz Superclass class - * @param Subclass type - * @param Superclass type + * @param Subclass type + * @param Superclass type * @return True if X is a subclass of Y */ - public static Boolean isSubClass(Class subClazz, Class superClazz) { + public static Boolean isSubClass( + Class subClazz, Class superClazz) { return superClazz.isAssignableFrom(subClazz); } @@ -104,33 +102,33 @@ public class ModelSupport { * @return */ public static Class[] getOafModelClasses() { - return new Class[]{ - Author.class, - Context.class, - Country.class, - DataInfo.class, - Dataset.class, - Datasource.class, - ExternalReference.class, - ExtraInfo.class, - Field.class, - GeoLocation.class, - Instance.class, - Journal.class, - KeyValue.class, - Oaf.class, - OafEntity.class, - OAIProvenance.class, - Organization.class, - OriginDescription.class, - OtherResearchProduct.class, - Project.class, - Publication.class, - Qualifier.class, - Relation.class, - Result.class, - Software.class, - StructuredProperty.class + return new Class[] { + Author.class, + Context.class, + Country.class, + DataInfo.class, + Dataset.class, + Datasource.class, + ExternalReference.class, + ExtraInfo.class, + Field.class, + GeoLocation.class, + Instance.class, + Journal.class, + KeyValue.class, + Oaf.class, + OafEntity.class, + OAIProvenance.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Publication.class, + Qualifier.class, + Relation.class, + Result.class, + Software.class, + StructuredProperty.class }; } @@ -143,9 +141,9 @@ public class ModelSupport { } public static String getScheme(final String sourceType, final String targetType) { - return String.format(schemeTemplate, + return String.format( + schemeTemplate, entityMapping.get(EntityType.valueOf(sourceType)).name(), entityMapping.get(EntityType.valueOf(targetType)).name()); } - -} \ No newline at end of file +} diff 
--git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java index 93ab60b05..e8022bfc1 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java @@ -71,12 +71,12 @@ public class Author implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Author author = (Author) o; - return Objects.equals(fullname, author.fullname) && - Objects.equals(name, author.name) && - Objects.equals(surname, author.surname) && - Objects.equals(rank, author.rank) && - Objects.equals(pid, author.pid) && - Objects.equals(affiliation, author.affiliation); + return Objects.equals(fullname, author.fullname) + && Objects.equals(name, author.name) + && Objects.equals(surname, author.surname) + && Objects.equals(rank, author.rank) + && Objects.equals(pid, author.pid) + && Objects.equals(affiliation, author.affiliation); } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java index 809200463..0e0d1ad32 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java @@ -26,17 +26,14 @@ public class Context implements Serializable { @Override public int hashCode() { - return id ==null? 0 : id.hashCode(); + return id == null ? 
0 : id.hashCode(); } @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; Context other = (Context) obj; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java index 49a186701..0e3f36671 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; -import java.util.List; import java.util.Objects; public class DataInfo implements Serializable { @@ -13,7 +12,6 @@ public class DataInfo implements Serializable { private String inferenceprovenance; private Qualifier provenanceaction; - public Boolean getInvisible() { return invisible; } @@ -67,16 +65,22 @@ public class DataInfo implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; DataInfo dataInfo = (DataInfo) o; - return Objects.equals(invisible, dataInfo.invisible) && - Objects.equals(inferred, dataInfo.inferred) && - Objects.equals(deletedbyinference, dataInfo.deletedbyinference) && - Objects.equals(trust, dataInfo.trust) && - Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) && - Objects.equals(provenanceaction, dataInfo.provenanceaction); + return Objects.equals(invisible, dataInfo.invisible) + && Objects.equals(inferred, dataInfo.inferred) + && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) + && Objects.equals(trust, dataInfo.trust) + && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) + && Objects.equals(provenanceaction, dataInfo.provenanceaction); } @Override public int hashCode() { - return Objects.hash(invisible, 
inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); + return Objects.hash( + invisible, + inferred, + deletedbyinference, + trust, + inferenceprovenance, + provenanceaction); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java index 70a6d1f31..fef6ed748 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java @@ -20,7 +20,7 @@ public class Dataset extends Result implements Serializable { private List geolocation; - public Field getStoragedate() { + public Field getStoragedate() { return storagedate; } @@ -80,23 +80,32 @@ public class Dataset extends Result implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Dataset.class.isAssignableFrom(e.getClass())){ + if (!Dataset.class.isAssignableFrom(e.getClass())) { return; } final Dataset d = (Dataset) e; - storagedate = d.getStoragedate() != null && compareTrust(this, e)<0? d.getStoragedate() : storagedate; + storagedate = + d.getStoragedate() != null && compareTrust(this, e) < 0 + ? d.getStoragedate() + : storagedate; - device= d.getDevice() != null && compareTrust(this, e)<0? d.getDevice() : device; + device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; - size= d.getSize() != null && compareTrust(this, e)<0? d.getSize() : size; + size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; - version= d.getVersion() != null && compareTrust(this, e)<0? d.getVersion() : version; + version = d.getVersion() != null && compareTrust(this, e) < 0 ? d.getVersion() : version; - lastmetadataupdate= d.getLastmetadataupdate() != null && compareTrust(this, e)<0? d.getLastmetadataupdate() :lastmetadataupdate; + lastmetadataupdate = + d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 + ? 
d.getLastmetadataupdate() + : lastmetadataupdate; - metadataversionnumber= d.getMetadataversionnumber() != null && compareTrust(this, e)<0? d.getMetadataversionnumber() : metadataversionnumber; + metadataversionnumber = + d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 + ? d.getMetadataversionnumber() + : metadataversionnumber; geolocation = mergeLists(geolocation, d.getGeolocation()); @@ -109,17 +118,25 @@ public class Dataset extends Result implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Dataset dataset = (Dataset) o; - return Objects.equals(storagedate, dataset.storagedate) && - Objects.equals(device, dataset.device) && - Objects.equals(size, dataset.size) && - Objects.equals(version, dataset.version) && - Objects.equals(lastmetadataupdate, dataset.lastmetadataupdate) && - Objects.equals(metadataversionnumber, dataset.metadataversionnumber) && - Objects.equals(geolocation, dataset.geolocation); + return Objects.equals(storagedate, dataset.storagedate) + && Objects.equals(device, dataset.device) + && Objects.equals(size, dataset.size) + && Objects.equals(version, dataset.version) + && Objects.equals(lastmetadataupdate, dataset.lastmetadataupdate) + && Objects.equals(metadataversionnumber, dataset.metadataversionnumber) + && Objects.equals(geolocation, dataset.geolocation); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), storagedate, device, size, version, lastmetadataupdate, metadataversionnumber, geolocation); + return Objects.hash( + super.hashCode(), + storagedate, + device, + size, + version, + lastmetadataupdate, + metadataversionnumber, + geolocation); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java index c8d9736c5..f2755b86b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java @@ -72,7 +72,7 @@ public class Datasource extends OafEntity implements Serializable { private Field citationguidelineurl; - //{yes, no, uknown} + // {yes, no, uknown} private Field qualitymanagementkind; private Field pidsystems; @@ -367,65 +367,148 @@ public class Datasource extends OafEntity implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Datasource.class.isAssignableFrom(e.getClass())){ + if (!Datasource.class.isAssignableFrom(e.getClass())) { return; } - Datasource d = (Datasource)e; + Datasource d = (Datasource) e; - datasourcetype = d.getDatasourcetype() != null && compareTrust(this, e)<0? d.getDatasourcetype() : datasourcetype; - openairecompatibility = d.getOpenairecompatibility() != null && compareTrust(this, e)<0? d.getOpenairecompatibility() : openairecompatibility; - officialname = d.getOfficialname() != null && compareTrust(this, e)<0? d.getOfficialname() : officialname; - englishname = d.getEnglishname() != null && compareTrust(this, e)<0? d.getEnglishname() : officialname; - websiteurl = d.getWebsiteurl() != null && compareTrust(this, e)<0? d.getWebsiteurl() : websiteurl; - logourl = d.getLogourl() != null && compareTrust(this, e)<0? d.getLogourl() : getLogourl(); - contactemail = d.getContactemail() != null && compareTrust(this, e)<0? d.getContactemail() : contactemail; - namespaceprefix = d.getNamespaceprefix() != null && compareTrust(this, e)<0? d.getNamespaceprefix() : namespaceprefix; - latitude = d.getLatitude() != null && compareTrust(this, e)<0? d.getLatitude() : latitude; - longitude = d.getLongitude() != null && compareTrust(this, e)<0? d.getLongitude() : longitude; - dateofvalidation = d.getDateofvalidation() != null && compareTrust(this, e)<0? d.getDateofvalidation() : dateofvalidation; - description = d.getDescription() != null && compareTrust(this, e)<0? 
d.getDescription() : description; + datasourcetype = + d.getDatasourcetype() != null && compareTrust(this, e) < 0 + ? d.getDatasourcetype() + : datasourcetype; + openairecompatibility = + d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 + ? d.getOpenairecompatibility() + : openairecompatibility; + officialname = + d.getOfficialname() != null && compareTrust(this, e) < 0 + ? d.getOfficialname() + : officialname; + englishname = + d.getEnglishname() != null && compareTrust(this, e) < 0 + ? d.getEnglishname() + : officialname; + websiteurl = + d.getWebsiteurl() != null && compareTrust(this, e) < 0 + ? d.getWebsiteurl() + : websiteurl; + logourl = + d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : getLogourl(); + contactemail = + d.getContactemail() != null && compareTrust(this, e) < 0 + ? d.getContactemail() + : contactemail; + namespaceprefix = + d.getNamespaceprefix() != null && compareTrust(this, e) < 0 + ? d.getNamespaceprefix() + : namespaceprefix; + latitude = + d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; + longitude = + d.getLongitude() != null && compareTrust(this, e) < 0 + ? d.getLongitude() + : longitude; + dateofvalidation = + d.getDateofvalidation() != null && compareTrust(this, e) < 0 + ? d.getDateofvalidation() + : dateofvalidation; + description = + d.getDescription() != null && compareTrust(this, e) < 0 + ? d.getDescription() + : description; subjects = mergeLists(subjects, d.getSubjects()); // opendoar specific fields (od*) - odnumberofitems = d.getOdnumberofitems() != null && compareTrust(this, e)<0? d.getOdnumberofitems() : odnumberofitems; - odnumberofitemsdate = d.getOdnumberofitemsdate() != null && compareTrust(this, e)<0? d.getOdnumberofitemsdate() : odnumberofitemsdate; - odpolicies = d.getOdpolicies() != null && compareTrust(this, e)<0? d.getOdpolicies() : odpolicies; + odnumberofitems = + d.getOdnumberofitems() != null && compareTrust(this, e) < 0 + ? 
d.getOdnumberofitems() + : odnumberofitems; + odnumberofitemsdate = + d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 + ? d.getOdnumberofitemsdate() + : odnumberofitemsdate; + odpolicies = + d.getOdpolicies() != null && compareTrust(this, e) < 0 + ? d.getOdpolicies() + : odpolicies; odlanguages = mergeLists(odlanguages, d.getOdlanguages()); odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); // re3data fields - releasestartdate = d.getReleasestartdate() != null && compareTrust(this, e)<0? d.getReleasestartdate() : releasestartdate; - releaseenddate = d.getReleaseenddate() != null && compareTrust(this, e)<0? d.getReleaseenddate() : releaseenddate; - missionstatementurl = d.getMissionstatementurl() != null && compareTrust(this, e)<0? d.getMissionstatementurl() : missionstatementurl; - dataprovider = d.getDataprovider() != null && compareTrust(this, e)<0? d.getDataprovider() : dataprovider; - serviceprovider = d.getServiceprovider() != null && compareTrust(this, e)<0? d.getServiceprovider() : serviceprovider; + releasestartdate = + d.getReleasestartdate() != null && compareTrust(this, e) < 0 + ? d.getReleasestartdate() + : releasestartdate; + releaseenddate = + d.getReleaseenddate() != null && compareTrust(this, e) < 0 + ? d.getReleaseenddate() + : releaseenddate; + missionstatementurl = + d.getMissionstatementurl() != null && compareTrust(this, e) < 0 + ? d.getMissionstatementurl() + : missionstatementurl; + dataprovider = + d.getDataprovider() != null && compareTrust(this, e) < 0 + ? d.getDataprovider() + : dataprovider; + serviceprovider = + d.getServiceprovider() != null && compareTrust(this, e) < 0 + ? d.getServiceprovider() + : serviceprovider; // {open, restricted or closed} - databaseaccesstype = d.getDatabaseaccesstype() != null && compareTrust(this, e)<0? 
d.getDatabaseaccesstype() : databaseaccesstype; + databaseaccesstype = + d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccesstype() + : databaseaccesstype; // {open, restricted or closed} - datauploadtype = d.getDatauploadtype() != null && compareTrust(this, e)<0? d.getDatauploadtype() : datauploadtype; + datauploadtype = + d.getDatauploadtype() != null && compareTrust(this, e) < 0 + ? d.getDatauploadtype() + : datauploadtype; // {feeRequired, registration, other} - databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && compareTrust(this, e)<0? d.getDatabaseaccessrestriction() : databaseaccessrestriction; + databaseaccessrestriction = + d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccessrestriction() + : databaseaccessrestriction; // {feeRequired, registration, other} - datauploadrestriction = d.getDatauploadrestriction() != null && compareTrust(this, e)<0? d.getDatauploadrestriction() : datauploadrestriction; + datauploadrestriction = + d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatauploadrestriction() + : datauploadrestriction; - versioning = d.getVersioning() != null && compareTrust(this, e)<0? d.getVersioning() : versioning; - citationguidelineurl = d.getCitationguidelineurl() != null && compareTrust(this, e)<0? d.getCitationguidelineurl() : citationguidelineurl; + versioning = + d.getVersioning() != null && compareTrust(this, e) < 0 + ? d.getVersioning() + : versioning; + citationguidelineurl = + d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 + ? d.getCitationguidelineurl() + : citationguidelineurl; - //{yes, no, unknown} - qualitymanagementkind = d.getQualitymanagementkind() != null && compareTrust(this, e)<0? d.getQualitymanagementkind() : qualitymanagementkind; - pidsystems = d.getPidsystems() != null && compareTrust(this, e)<0? 
d.getPidsystems() : pidsystems; + // {yes, no, unknown} + qualitymanagementkind = + d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 + ? d.getQualitymanagementkind() + : qualitymanagementkind; + pidsystems = + d.getPidsystems() != null && compareTrust(this, e) < 0 + ? d.getPidsystems() + : pidsystems; - certificates = d.getCertificates() != null && compareTrust(this, e)<0? d.getCertificates() : certificates; + certificates = + d.getCertificates() != null && compareTrust(this, e) < 0 + ? d.getCertificates() + : certificates; policies = mergeLists(policies, d.getPolicies()); - journal = d.getJournal() != null && compareTrust(this, e)<0? d.getJournal() : journal; + journal = d.getJournal() != null && compareTrust(this, e) < 0 ? d.getJournal() : journal; mergeOAFDataInfo(e); } @@ -436,45 +519,81 @@ public class Datasource extends OafEntity implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Datasource that = (Datasource) o; - return Objects.equals(datasourcetype, that.datasourcetype) && - Objects.equals(openairecompatibility, that.openairecompatibility) && - Objects.equals(officialname, that.officialname) && - Objects.equals(englishname, that.englishname) && - Objects.equals(websiteurl, that.websiteurl) && - Objects.equals(logourl, that.logourl) && - Objects.equals(contactemail, that.contactemail) && - Objects.equals(namespaceprefix, that.namespaceprefix) && - Objects.equals(latitude, that.latitude) && - Objects.equals(longitude, that.longitude) && - Objects.equals(dateofvalidation, that.dateofvalidation) && - Objects.equals(description, that.description) && - Objects.equals(subjects, that.subjects) && - Objects.equals(odnumberofitems, that.odnumberofitems) && - Objects.equals(odnumberofitemsdate, that.odnumberofitemsdate) && - Objects.equals(odpolicies, that.odpolicies) && - Objects.equals(odlanguages, that.odlanguages) && - Objects.equals(odcontenttypes, that.odcontenttypes) && - 
Objects.equals(accessinfopackage, that.accessinfopackage) && - Objects.equals(releasestartdate, that.releasestartdate) && - Objects.equals(releaseenddate, that.releaseenddate) && - Objects.equals(missionstatementurl, that.missionstatementurl) && - Objects.equals(dataprovider, that.dataprovider) && - Objects.equals(serviceprovider, that.serviceprovider) && - Objects.equals(databaseaccesstype, that.databaseaccesstype) && - Objects.equals(datauploadtype, that.datauploadtype) && - Objects.equals(databaseaccessrestriction, that.databaseaccessrestriction) && - Objects.equals(datauploadrestriction, that.datauploadrestriction) && - Objects.equals(versioning, that.versioning) && - Objects.equals(citationguidelineurl, that.citationguidelineurl) && - Objects.equals(qualitymanagementkind, that.qualitymanagementkind) && - Objects.equals(pidsystems, that.pidsystems) && - Objects.equals(certificates, that.certificates) && - Objects.equals(policies, that.policies) && - Objects.equals(journal, that.journal); + return Objects.equals(datasourcetype, that.datasourcetype) + && Objects.equals(openairecompatibility, that.openairecompatibility) + && Objects.equals(officialname, that.officialname) + && Objects.equals(englishname, that.englishname) + && Objects.equals(websiteurl, that.websiteurl) + && Objects.equals(logourl, that.logourl) + && Objects.equals(contactemail, that.contactemail) + && Objects.equals(namespaceprefix, that.namespaceprefix) + && Objects.equals(latitude, that.latitude) + && Objects.equals(longitude, that.longitude) + && Objects.equals(dateofvalidation, that.dateofvalidation) + && Objects.equals(description, that.description) + && Objects.equals(subjects, that.subjects) + && Objects.equals(odnumberofitems, that.odnumberofitems) + && Objects.equals(odnumberofitemsdate, that.odnumberofitemsdate) + && Objects.equals(odpolicies, that.odpolicies) + && Objects.equals(odlanguages, that.odlanguages) + && Objects.equals(odcontenttypes, that.odcontenttypes) + && 
Objects.equals(accessinfopackage, that.accessinfopackage) + && Objects.equals(releasestartdate, that.releasestartdate) + && Objects.equals(releaseenddate, that.releaseenddate) + && Objects.equals(missionstatementurl, that.missionstatementurl) + && Objects.equals(dataprovider, that.dataprovider) + && Objects.equals(serviceprovider, that.serviceprovider) + && Objects.equals(databaseaccesstype, that.databaseaccesstype) + && Objects.equals(datauploadtype, that.datauploadtype) + && Objects.equals(databaseaccessrestriction, that.databaseaccessrestriction) + && Objects.equals(datauploadrestriction, that.datauploadrestriction) + && Objects.equals(versioning, that.versioning) + && Objects.equals(citationguidelineurl, that.citationguidelineurl) + && Objects.equals(qualitymanagementkind, that.qualitymanagementkind) + && Objects.equals(pidsystems, that.pidsystems) + && Objects.equals(certificates, that.certificates) + && Objects.equals(policies, that.policies) + && Objects.equals(journal, that.journal); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), datasourcetype, openairecompatibility, officialname, englishname, websiteurl, logourl, contactemail, namespaceprefix, latitude, longitude, dateofvalidation, description, subjects, odnumberofitems, odnumberofitemsdate, odpolicies, odlanguages, odcontenttypes, accessinfopackage, releasestartdate, releaseenddate, missionstatementurl, dataprovider, serviceprovider, databaseaccesstype, datauploadtype, databaseaccessrestriction, datauploadrestriction, versioning, citationguidelineurl, qualitymanagementkind, pidsystems, certificates, policies, journal); + return Objects.hash( + super.hashCode(), + datasourcetype, + openairecompatibility, + officialname, + englishname, + websiteurl, + logourl, + contactemail, + namespaceprefix, + latitude, + longitude, + dateofvalidation, + description, + subjects, + odnumberofitems, + odnumberofitemsdate, + odpolicies, + odlanguages, + odcontenttypes, + accessinfopackage, + 
releasestartdate, + releaseenddate, + missionstatementurl, + dataprovider, + serviceprovider, + databaseaccesstype, + datauploadtype, + databaseaccessrestriction, + datauploadrestriction, + versioning, + citationguidelineurl, + qualitymanagementkind, + pidsystems, + certificates, + policies, + journal); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java index 24c94065b..35f84a9a7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java @@ -97,18 +97,19 @@ public class ExternalReference implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ExternalReference that = (ExternalReference) o; - return Objects.equals(sitename, that.sitename) && - Objects.equals(label, that.label) && - Objects.equals(url, that.url) && - Objects.equals(description, that.description) && - Objects.equals(qualifier, that.qualifier) && - Objects.equals(refidentifier, that.refidentifier) && - Objects.equals(query, that.query) && - Objects.equals(dataInfo, that.dataInfo); + return Objects.equals(sitename, that.sitename) + && Objects.equals(label, that.label) + && Objects.equals(url, that.url) + && Objects.equals(description, that.description) + && Objects.equals(qualifier, that.qualifier) + && Objects.equals(refidentifier, that.refidentifier) + && Objects.equals(query, that.query) + && Objects.equals(dataInfo, that.dataInfo); } @Override public int hashCode() { - return Objects.hash(sitename, label, url, description, qualifier, refidentifier, query, dataInfo); + return Objects.hash( + sitename, label, url, description, qualifier, refidentifier, query, dataInfo); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java 
index 46107b214..d01f1035c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java @@ -60,11 +60,11 @@ public class ExtraInfo implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ExtraInfo extraInfo = (ExtraInfo) o; - return Objects.equals(name, extraInfo.name) && - Objects.equals(typology, extraInfo.typology) && - Objects.equals(provenance, extraInfo.provenance) && - Objects.equals(trust, extraInfo.trust) && - Objects.equals(value, extraInfo.value); + return Objects.equals(name, extraInfo.name) + && Objects.equals(typology, extraInfo.typology) + && Objects.equals(provenance, extraInfo.provenance) + && Objects.equals(trust, extraInfo.trust) + && Objects.equals(value, extraInfo.value); } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index 2ab0b4d3c..ec9e1e0e7 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -31,12 +31,9 @@ public class Field implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; Field other = (Field) obj; return getValue().equals(other.getValue()); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java index 1839fbd53..85a3dca95 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; import 
com.fasterxml.jackson.annotation.JsonIgnore; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; +import org.apache.commons.lang3.StringUtils; public class GeoLocation implements Serializable { @@ -39,13 +38,17 @@ public class GeoLocation implements Serializable { @JsonIgnore public boolean isBlank() { - return StringUtils.isBlank(point) && - StringUtils.isBlank(box) && - StringUtils.isBlank(place); + return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); } public String toComparableString() { - return isBlank()?"":String.format("%s::%s%s", point != null ? point.toLowerCase() : "", box != null ? box.toLowerCase() : "", place != null ? place.toLowerCase() : ""); + return isBlank() + ? "" + : String.format( + "%s::%s%s", + point != null ? point.toLowerCase() : "", + box != null ? box.toLowerCase() : "", + place != null ? place.toLowerCase() : ""); } @Override @@ -55,16 +58,12 @@ public class GeoLocation implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; GeoLocation other = (GeoLocation) obj; - return toComparableString() - .equals(other.toComparableString()); + return toComparableString().equals(other.toComparableString()); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index f82296d8b..a215c342a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -16,19 +16,21 @@ public class Instance implements Serializable { private List url; // other research products specifc - private String distributionlocation; + private String distributionlocation; private 
KeyValue collectedfrom; private Field dateofacceptance; - // ( article | book ) processing charges. Defined here to cope with possible wrongly typed results + // ( article | book ) processing charges. Defined here to cope with possible wrongly typed + // results private Field processingchargeamount; - // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results + // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly + // typed results private Field processingchargecurrency; - private Field refereed; //peer-review status + private Field refereed; // peer-review status public Field getLicense() { return license; @@ -118,12 +120,19 @@ public class Instance implements Serializable { this.refereed = refereed; } - public String toComparableString(){ - return String.format("%s::%s::%s::%s", - hostedby != null && hostedby.getKey()!= null ? hostedby.getKey().toLowerCase() : "", - accessright!= null && accessright.getClassid()!= null ? accessright.getClassid() : "", - instancetype!= null && instancetype.getClassid()!= null ? instancetype.getClassid() : "", - url != null ? url:""); + public String toComparableString() { + return String.format( + "%s::%s::%s::%s", + hostedby != null && hostedby.getKey() != null + ? hostedby.getKey().toLowerCase() + : "", + accessright != null && accessright.getClassid() != null + ? accessright.getClassid() + : "", + instancetype != null && instancetype.getClassid() != null + ? instancetype.getClassid() + : "", + url != null ? 
url : ""); } @Override @@ -133,16 +142,12 @@ public class Instance implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; Instance other = (Instance) obj; - return toComparableString() - .equals(other.toComparableString()); + return toComparableString().equals(other.toComparableString()); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java index e3c80b120..c45c93af9 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java @@ -130,22 +130,34 @@ public class Journal implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Journal journal = (Journal) o; - return Objects.equals(name, journal.name) && - Objects.equals(issnPrinted, journal.issnPrinted) && - Objects.equals(issnOnline, journal.issnOnline) && - Objects.equals(issnLinking, journal.issnLinking) && - Objects.equals(ep, journal.ep) && - Objects.equals(iss, journal.iss) && - Objects.equals(sp, journal.sp) && - Objects.equals(vol, journal.vol) && - Objects.equals(edition, journal.edition) && - Objects.equals(conferenceplace, journal.conferenceplace) && - Objects.equals(conferencedate, journal.conferencedate) && - Objects.equals(dataInfo, journal.dataInfo); + return Objects.equals(name, journal.name) + && Objects.equals(issnPrinted, journal.issnPrinted) + && Objects.equals(issnOnline, journal.issnOnline) + && Objects.equals(issnLinking, journal.issnLinking) + && Objects.equals(ep, journal.ep) + && Objects.equals(iss, journal.iss) + && Objects.equals(sp, journal.sp) + && Objects.equals(vol, journal.vol) + && 
Objects.equals(edition, journal.edition) + && Objects.equals(conferenceplace, journal.conferenceplace) + && Objects.equals(conferencedate, journal.conferencedate) + && Objects.equals(dataInfo, journal.dataInfo); } @Override public int hashCode() { - return Objects.hash(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, conferenceplace, conferencedate, dataInfo); + return Objects.hash( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + conferenceplace, + conferencedate, + dataInfo); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 5a841b96f..27b2f4a18 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; import com.fasterxml.jackson.annotation.JsonIgnore; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; +import org.apache.commons.lang3.StringUtils; public class KeyValue implements Serializable { @@ -38,7 +37,12 @@ public class KeyValue implements Serializable { } public String toComparableString() { - return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); + return isBlank() + ? "" + : String.format( + "%s::%s", + key != null ? key.toLowerCase() : "", + value != null ? 
value.toLowerCase() : ""); } @JsonIgnore @@ -53,12 +57,9 @@ public class KeyValue implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; KeyValue other = (KeyValue) obj; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java index 52ddf27d8..d85580a68 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java @@ -3,7 +3,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; -public class OAIProvenance implements Serializable { +public class OAIProvenance implements Serializable { private OriginDescription originDescription; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java index 64217e5d8..7a7a9a89c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java @@ -4,7 +4,7 @@ import java.io.Serializable; import java.util.Objects; public abstract class Oaf implements Serializable { - + private DataInfo dataInfo; private Long lastupdatetimestamp; @@ -25,24 +25,18 @@ public abstract class Oaf implements Serializable { this.lastupdatetimestamp = lastupdatetimestamp; } - public void mergeOAFDataInfo(Oaf e) { - if (e.getDataInfo()!= null && compareTrust(this,e)<0) - dataInfo = e.getDataInfo(); + if (e.getDataInfo() != null && compareTrust(this, e) < 0) dataInfo = e.getDataInfo(); } protected String extractTrust(Oaf e) { - if (e == null || e.getDataInfo()== null || e.getDataInfo().getTrust()== null) - return "0.0"; + if 
(e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) + return "0.0"; return e.getDataInfo().getTrust(); - - - } protected int compareTrust(Oaf a, Oaf b) { return extractTrust(a).compareTo(extractTrust(b)); - } @Override @@ -50,8 +44,8 @@ public abstract class Oaf implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Oaf oaf = (Oaf) o; - return Objects.equals(dataInfo, oaf.dataInfo) && - Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); + return Objects.equals(dataInfo, oaf.dataInfo) + && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java index cd196243a..86f4ff616 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java @@ -86,11 +86,9 @@ public abstract class OafEntity extends Oaf implements Serializable { this.oaiprovenance = oaiprovenance; } - public void mergeFrom(OafEntity e) { - if (e == null) - return; + if (e == null) return; originalId = mergeLists(originalId, e.getOriginalId()); @@ -108,12 +106,15 @@ public abstract class OafEntity extends Oaf implements Serializable { if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) oaiprovenance = e.getOaiprovenance(); - } protected List mergeLists(final List... 
lists) { - return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList()); + return Arrays.stream(lists) + .filter(Objects::nonNull) + .flatMap(List::stream) + .distinct() + .collect(Collectors.toList()); } @Override @@ -122,18 +123,27 @@ public abstract class OafEntity extends Oaf implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; OafEntity oafEntity = (OafEntity) o; - return Objects.equals(id, oafEntity.id) && - Objects.equals(originalId, oafEntity.originalId) && - Objects.equals(collectedfrom, oafEntity.collectedfrom) && - Objects.equals(pid, oafEntity.pid) && - Objects.equals(dateofcollection, oafEntity.dateofcollection) && - Objects.equals(dateoftransformation, oafEntity.dateoftransformation) && - Objects.equals(extraInfo, oafEntity.extraInfo) && - Objects.equals(oaiprovenance, oafEntity.oaiprovenance); + return Objects.equals(id, oafEntity.id) + && Objects.equals(originalId, oafEntity.originalId) + && Objects.equals(collectedfrom, oafEntity.collectedfrom) + && Objects.equals(pid, oafEntity.pid) + && Objects.equals(dateofcollection, oafEntity.dateofcollection) + && Objects.equals(dateoftransformation, oafEntity.dateoftransformation) + && Objects.equals(extraInfo, oafEntity.extraInfo) + && Objects.equals(oaiprovenance, oafEntity.oaiprovenance); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), id, originalId, collectedfrom, pid, dateofcollection, dateoftransformation, extraInfo, oaiprovenance); + return Objects.hash( + super.hashCode(), + id, + originalId, + collectedfrom, + pid, + dateofcollection, + dateoftransformation, + extraInfo, + oaiprovenance); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java index a73a7f845..7352b4847 100644 --- 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java @@ -122,7 +122,8 @@ public class Organization extends OafEntity implements Serializable { return ecinternationalorganizationeurinterests; } - public void setEcinternationalorganizationeurinterests(Field ecinternationalorganizationeurinterests) { + public void setEcinternationalorganizationeurinterests( + Field ecinternationalorganizationeurinterests) { this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; } @@ -166,32 +167,70 @@ public class Organization extends OafEntity implements Serializable { this.country = country; } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Organization.class.isAssignableFrom(e.getClass())){ + if (!Organization.class.isAssignableFrom(e.getClass())) { return; } final Organization o = (Organization) e; - legalshortname = o.getLegalshortname() != null && compareTrust(this, e)<0? o.getLegalshortname() : legalshortname; - legalname = o.getLegalname() != null && compareTrust(this, e)<0 ? o.getLegalname() : legalname; + legalshortname = + o.getLegalshortname() != null && compareTrust(this, e) < 0 + ? o.getLegalshortname() + : legalshortname; + legalname = + o.getLegalname() != null && compareTrust(this, e) < 0 + ? o.getLegalname() + : legalname; alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); - websiteurl = o.getWebsiteurl() != null && compareTrust(this, e)<0? o.getWebsiteurl() : websiteurl; - logourl = o.getLogourl() != null && compareTrust(this, e)<0? o.getLogourl() : logourl; - eclegalbody = o.getEclegalbody() != null && compareTrust(this, e)<0? o.getEclegalbody() : eclegalbody; - eclegalperson = o.getEclegalperson() != null && compareTrust(this, e)<0? o.getEclegalperson() : eclegalperson; - ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e)<0? 
o.getEcnonprofit() : ecnonprofit; - ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e)<0? o.getEcresearchorganization() : ecresearchorganization; - echighereducation = o.getEchighereducation() != null && compareTrust(this, e)<0? o.getEchighereducation() : echighereducation; - ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e)<0? o.getEcinternationalorganizationeurinterests() : ecinternationalorganizationeurinterests; - ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e)<0? o.getEcinternationalorganization() : ecinternationalorganization; - ecenterprise = o.getEcenterprise() != null && compareTrust(this, e)<0? o.getEcenterprise() :ecenterprise; - ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e)<0? o.getEcsmevalidated() :ecsmevalidated; - ecnutscode = o.getEcnutscode() != null && compareTrust(this, e)<0? o.getEcnutscode() :ecnutscode; - country = o.getCountry() != null && compareTrust(this, e)<0 ? o.getCountry() :country; + websiteurl = + o.getWebsiteurl() != null && compareTrust(this, e) < 0 + ? o.getWebsiteurl() + : websiteurl; + logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; + eclegalbody = + o.getEclegalbody() != null && compareTrust(this, e) < 0 + ? o.getEclegalbody() + : eclegalbody; + eclegalperson = + o.getEclegalperson() != null && compareTrust(this, e) < 0 + ? o.getEclegalperson() + : eclegalperson; + ecnonprofit = + o.getEcnonprofit() != null && compareTrust(this, e) < 0 + ? o.getEcnonprofit() + : ecnonprofit; + ecresearchorganization = + o.getEcresearchorganization() != null && compareTrust(this, e) < 0 + ? o.getEcresearchorganization() + : ecresearchorganization; + echighereducation = + o.getEchighereducation() != null && compareTrust(this, e) < 0 + ? 
o.getEchighereducation() + : echighereducation; + ecinternationalorganizationeurinterests = + o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e) < 0 + ? o.getEcinternationalorganizationeurinterests() + : ecinternationalorganizationeurinterests; + ecinternationalorganization = + o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 + ? o.getEcinternationalorganization() + : ecinternationalorganization; + ecenterprise = + o.getEcenterprise() != null && compareTrust(this, e) < 0 + ? o.getEcenterprise() + : ecenterprise; + ecsmevalidated = + o.getEcsmevalidated() != null && compareTrust(this, e) < 0 + ? o.getEcsmevalidated() + : ecsmevalidated; + ecnutscode = + o.getEcnutscode() != null && compareTrust(this, e) < 0 + ? o.getEcnutscode() + : ecnutscode; + country = o.getCountry() != null && compareTrust(this, e) < 0 ? o.getCountry() : country; mergeOAFDataInfo(o); } @@ -201,26 +240,45 @@ public class Organization extends OafEntity implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Organization that = (Organization) o; - return Objects.equals(legalshortname, that.legalshortname) && - Objects.equals(legalname, that.legalname) && - Objects.equals(alternativeNames, that.alternativeNames) && - Objects.equals(websiteurl, that.websiteurl) && - Objects.equals(logourl, that.logourl) && - Objects.equals(eclegalbody, that.eclegalbody) && - Objects.equals(eclegalperson, that.eclegalperson) && - Objects.equals(ecnonprofit, that.ecnonprofit) && - Objects.equals(ecresearchorganization, that.ecresearchorganization) && - Objects.equals(echighereducation, that.echighereducation) && - Objects.equals(ecinternationalorganizationeurinterests, that.ecinternationalorganizationeurinterests) && - Objects.equals(ecinternationalorganization, that.ecinternationalorganization) && - Objects.equals(ecenterprise, that.ecenterprise) && - Objects.equals(ecsmevalidated, that.ecsmevalidated) 
&& - Objects.equals(ecnutscode, that.ecnutscode) && - Objects.equals(country, that.country); + return Objects.equals(legalshortname, that.legalshortname) + && Objects.equals(legalname, that.legalname) + && Objects.equals(alternativeNames, that.alternativeNames) + && Objects.equals(websiteurl, that.websiteurl) + && Objects.equals(logourl, that.logourl) + && Objects.equals(eclegalbody, that.eclegalbody) + && Objects.equals(eclegalperson, that.eclegalperson) + && Objects.equals(ecnonprofit, that.ecnonprofit) + && Objects.equals(ecresearchorganization, that.ecresearchorganization) + && Objects.equals(echighereducation, that.echighereducation) + && Objects.equals( + ecinternationalorganizationeurinterests, + that.ecinternationalorganizationeurinterests) + && Objects.equals(ecinternationalorganization, that.ecinternationalorganization) + && Objects.equals(ecenterprise, that.ecenterprise) + && Objects.equals(ecsmevalidated, that.ecsmevalidated) + && Objects.equals(ecnutscode, that.ecnutscode) + && Objects.equals(country, that.country); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), legalshortname, legalname, alternativeNames, websiteurl, logourl, eclegalbody, eclegalperson, ecnonprofit, ecresearchorganization, echighereducation, ecinternationalorganizationeurinterests, ecinternationalorganization, ecenterprise, ecsmevalidated, ecnutscode, country); + return Objects.hash( + super.hashCode(), + legalshortname, + legalname, + alternativeNames, + websiteurl, + logourl, + eclegalbody, + eclegalperson, + ecnonprofit, + ecresearchorganization, + echighereducation, + ecinternationalorganizationeurinterests, + ecinternationalorganization, + ecenterprise, + ecsmevalidated, + ecnutscode, + country); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java index 8038242ca..6574ddf4e 100644 --- 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java @@ -70,16 +70,17 @@ public class OriginDescription implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; OriginDescription that = (OriginDescription) o; - return Objects.equals(harvestDate, that.harvestDate) && - Objects.equals(altered, that.altered) && - Objects.equals(baseURL, that.baseURL) && - Objects.equals(identifier, that.identifier) && - Objects.equals(datestamp, that.datestamp) && - Objects.equals(metadataNamespace, that.metadataNamespace); + return Objects.equals(harvestDate, that.harvestDate) + && Objects.equals(altered, that.altered) + && Objects.equals(baseURL, that.baseURL) + && Objects.equals(identifier, that.identifier) + && Objects.equals(datestamp, that.datestamp) + && Objects.equals(metadataNamespace, that.metadataNamespace); } @Override public int hashCode() { - return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); + return Objects.hash( + harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java index cc36817bb..00b24a374 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java @@ -40,11 +40,11 @@ public class OtherResearchProduct extends Result implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())){ + if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { return; } - OtherResearchProduct o = (OtherResearchProduct)e; + OtherResearchProduct o = (OtherResearchProduct) e; contactperson = 
mergeLists(contactperson, o.getContactperson()); contactgroup = mergeLists(contactgroup, o.getContactgroup()); @@ -58,9 +58,9 @@ public class OtherResearchProduct extends Result implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; OtherResearchProduct that = (OtherResearchProduct) o; - return Objects.equals(contactperson, that.contactperson) && - Objects.equals(contactgroup, that.contactgroup) && - Objects.equals(tool, that.tool); + return Objects.equals(contactperson, that.contactperson) + && Objects.equals(contactgroup, that.contactgroup) + && Objects.equals(tool, that.tool); } @Override diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java index 023be8fc0..7ed816fbe 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java @@ -266,44 +266,91 @@ public class Project extends OafEntity implements Serializable { this.fundedamount = fundedamount; } - @Override public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Project.class.isAssignableFrom(e.getClass())){ + if (!Project.class.isAssignableFrom(e.getClass())) { return; } - Project p = (Project)e; + Project p = (Project) e; - websiteurl= p.getWebsiteurl()!= null && compareTrust(this,e)<0?p.getWebsiteurl():websiteurl; - code= p.getCode()!=null && compareTrust(this,e)<0?p.getCode():code; - acronym= p.getAcronym()!= null && compareTrust(this,e)<0?p.getAcronym():acronym; - title= p.getTitle()!= null && compareTrust(this,e)<0?p.getTitle():title; - startdate= p.getStartdate()!=null && compareTrust(this,e)<0?p.getStartdate():startdate; - enddate= p.getEnddate()!=null && compareTrust(this,e)<0?p.getEnddate():enddate; - callidentifier= p.getCallidentifier()!=null && compareTrust(this,e)<0?p.getCallidentifier():callidentifier; - keywords= p.getKeywords()!=null && 
compareTrust(this,e)<0?p.getKeywords():keywords; - duration= p.getDuration()!=null && compareTrust(this,e)<0?p.getDuration():duration; - ecsc39= p.getEcsc39()!=null && compareTrust(this,e)<0?p.getEcsc39():ecsc39; - oamandatepublications= p.getOamandatepublications()!=null && compareTrust(this,e)<0?p.getOamandatepublications():oamandatepublications; - ecarticle29_3= p.getEcarticle29_3()!=null && compareTrust(this,e)<0?p.getEcarticle29_3():ecarticle29_3; - subjects= mergeLists(subjects, p.getSubjects()); - fundingtree= mergeLists(fundingtree, p.getFundingtree()); - contracttype= p.getContracttype()!=null && compareTrust(this,e)<0?p.getContracttype():contracttype; - optional1= p.getOptional1()!=null && compareTrust(this,e)<0?p.getOptional1():optional1; - optional2= p.getOptional2()!=null && compareTrust(this,e)<0?p.getOptional2():optional2; - jsonextrainfo= p.getJsonextrainfo()!=null && compareTrust(this,e)<0?p.getJsonextrainfo():jsonextrainfo; - contactfullname= p.getContactfullname()!=null && compareTrust(this,e)<0?p.getContactfullname():contactfullname; - contactfax= p.getContactfax()!=null && compareTrust(this,e)<0?p.getContactfax():contactfax; - contactphone= p.getContactphone()!=null && compareTrust(this,e)<0?p.getContactphone():contactphone; - contactemail= p.getContactemail()!=null && compareTrust(this,e)<0?p.getContactemail():contactemail; - summary= p.getSummary()!=null && compareTrust(this,e)<0?p.getSummary():summary; - currency= p.getCurrency()!=null && compareTrust(this,e)<0?p.getCurrency():currency; - totalcost= p.getTotalcost()!=null && compareTrust(this,e)<0?p.getTotalcost():totalcost; - fundedamount= p.getFundedamount()!= null && compareTrust(this,e)<0?p.getFundedamount():fundedamount; - mergeOAFDataInfo(e); + websiteurl = + p.getWebsiteurl() != null && compareTrust(this, e) < 0 + ? p.getWebsiteurl() + : websiteurl; + code = p.getCode() != null && compareTrust(this, e) < 0 ? 
p.getCode() : code; + acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; + title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; + startdate = + p.getStartdate() != null && compareTrust(this, e) < 0 + ? p.getStartdate() + : startdate; + enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? p.getEnddate() : enddate; + callidentifier = + p.getCallidentifier() != null && compareTrust(this, e) < 0 + ? p.getCallidentifier() + : callidentifier; + keywords = + p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; + duration = + p.getDuration() != null && compareTrust(this, e) < 0 ? p.getDuration() : duration; + ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; + oamandatepublications = + p.getOamandatepublications() != null && compareTrust(this, e) < 0 + ? p.getOamandatepublications() + : oamandatepublications; + ecarticle29_3 = + p.getEcarticle29_3() != null && compareTrust(this, e) < 0 + ? p.getEcarticle29_3() + : ecarticle29_3; + subjects = mergeLists(subjects, p.getSubjects()); + fundingtree = mergeLists(fundingtree, p.getFundingtree()); + contracttype = + p.getContracttype() != null && compareTrust(this, e) < 0 + ? p.getContracttype() + : contracttype; + optional1 = + p.getOptional1() != null && compareTrust(this, e) < 0 + ? p.getOptional1() + : optional1; + optional2 = + p.getOptional2() != null && compareTrust(this, e) < 0 + ? p.getOptional2() + : optional2; + jsonextrainfo = + p.getJsonextrainfo() != null && compareTrust(this, e) < 0 + ? p.getJsonextrainfo() + : jsonextrainfo; + contactfullname = + p.getContactfullname() != null && compareTrust(this, e) < 0 + ? p.getContactfullname() + : contactfullname; + contactfax = + p.getContactfax() != null && compareTrust(this, e) < 0 + ? p.getContactfax() + : contactfax; + contactphone = + p.getContactphone() != null && compareTrust(this, e) < 0 + ? 
p.getContactphone() + : contactphone; + contactemail = + p.getContactemail() != null && compareTrust(this, e) < 0 + ? p.getContactemail() + : contactemail; + summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; + currency = + p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; + totalcost = + p.getTotalcost() != null && compareTrust(this, e) < 0 + ? p.getTotalcost() + : totalcost; + fundedamount = + p.getFundedamount() != null && compareTrust(this, e) < 0 + ? p.getFundedamount() + : fundedamount; + mergeOAFDataInfo(e); } @Override @@ -312,36 +359,63 @@ public class Project extends OafEntity implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Project project = (Project) o; - return Objects.equals(websiteurl, project.websiteurl) && - Objects.equals(code, project.code) && - Objects.equals(acronym, project.acronym) && - Objects.equals(title, project.title) && - Objects.equals(startdate, project.startdate) && - Objects.equals(enddate, project.enddate) && - Objects.equals(callidentifier, project.callidentifier) && - Objects.equals(keywords, project.keywords) && - Objects.equals(duration, project.duration) && - Objects.equals(ecsc39, project.ecsc39) && - Objects.equals(oamandatepublications, project.oamandatepublications) && - Objects.equals(ecarticle29_3, project.ecarticle29_3) && - Objects.equals(subjects, project.subjects) && - Objects.equals(fundingtree, project.fundingtree) && - Objects.equals(contracttype, project.contracttype) && - Objects.equals(optional1, project.optional1) && - Objects.equals(optional2, project.optional2) && - Objects.equals(jsonextrainfo, project.jsonextrainfo) && - Objects.equals(contactfullname, project.contactfullname) && - Objects.equals(contactfax, project.contactfax) && - Objects.equals(contactphone, project.contactphone) && - Objects.equals(contactemail, project.contactemail) && - 
Objects.equals(summary, project.summary) && - Objects.equals(currency, project.currency) && - Objects.equals(totalcost, project.totalcost) && - Objects.equals(fundedamount, project.fundedamount); + return Objects.equals(websiteurl, project.websiteurl) + && Objects.equals(code, project.code) + && Objects.equals(acronym, project.acronym) + && Objects.equals(title, project.title) + && Objects.equals(startdate, project.startdate) + && Objects.equals(enddate, project.enddate) + && Objects.equals(callidentifier, project.callidentifier) + && Objects.equals(keywords, project.keywords) + && Objects.equals(duration, project.duration) + && Objects.equals(ecsc39, project.ecsc39) + && Objects.equals(oamandatepublications, project.oamandatepublications) + && Objects.equals(ecarticle29_3, project.ecarticle29_3) + && Objects.equals(subjects, project.subjects) + && Objects.equals(fundingtree, project.fundingtree) + && Objects.equals(contracttype, project.contracttype) + && Objects.equals(optional1, project.optional1) + && Objects.equals(optional2, project.optional2) + && Objects.equals(jsonextrainfo, project.jsonextrainfo) + && Objects.equals(contactfullname, project.contactfullname) + && Objects.equals(contactfax, project.contactfax) + && Objects.equals(contactphone, project.contactphone) + && Objects.equals(contactemail, project.contactemail) + && Objects.equals(summary, project.summary) + && Objects.equals(currency, project.currency) + && Objects.equals(totalcost, project.totalcost) + && Objects.equals(fundedamount, project.fundedamount); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), websiteurl, code, acronym, title, startdate, enddate, callidentifier, keywords, duration, ecsc39, oamandatepublications, ecarticle29_3, subjects, fundingtree, contracttype, optional1, optional2, jsonextrainfo, contactfullname, contactfax, contactphone, contactemail, summary, currency, totalcost, fundedamount); + return Objects.hash( + super.hashCode(), + websiteurl, + 
code, + acronym, + title, + startdate, + enddate, + callidentifier, + keywords, + duration, + ecsc39, + oamandatepublications, + ecarticle29_3, + subjects, + fundingtree, + contracttype, + optional1, + optional2, + jsonextrainfo, + contactfullname, + contactfax, + contactphone, + contactemail, + summary, + currency, + totalcost, + fundedamount); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java index 20a0b2121..1542b7933 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java @@ -20,14 +20,13 @@ public class Publication extends Result implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Publication.class.isAssignableFrom(e.getClass())){ + if (!Publication.class.isAssignableFrom(e.getClass())) { return; } Publication p = (Publication) e; - if (p.getJournal() != null && compareTrust(this, e)<0) - journal = p.getJournal(); + if (p.getJournal() != null && compareTrust(this, e) < 0) journal = p.getJournal(); mergeOAFDataInfo(e); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index ae2bf1a60..9e42bdae3 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -1,9 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; import com.fasterxml.jackson.annotation.JsonIgnore; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; +import org.apache.commons.lang3.StringUtils; public class Qualifier implements Serializable { @@ -45,20 +44,24 @@ public class Qualifier implements Serializable { } public String toComparableString() { - return isBlank()?"": String.format("%s::%s::%s::%s", - classid != null ? 
classid : "", - classname != null ? classname : "", - schemeid != null ? schemeid : "", - schemename != null ? schemename : ""); + return isBlank() + ? "" + : String.format( + "%s::%s::%s::%s", + classid != null ? classid : "", + classname != null ? classname : "", + schemeid != null ? schemeid : "", + schemename != null ? schemename : ""); } @JsonIgnore public boolean isBlank() { - return StringUtils.isBlank(classid) && - StringUtils.isBlank(classname) && - StringUtils.isBlank(schemeid) && - StringUtils.isBlank(schemename); + return StringUtils.isBlank(classid) + && StringUtils.isBlank(classname) + && StringUtils.isBlank(schemeid) + && StringUtils.isBlank(schemename); } + @Override public int hashCode() { return toComparableString().hashCode(); @@ -66,16 +69,12 @@ public class Qualifier implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; Qualifier other = (Qualifier) obj; - return toComparableString() - .equals(other.toComparableString()); + return toComparableString().equals(other.toComparableString()); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 6871c0197..9d211d058 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,104 +1,108 @@ package eu.dnetlib.dhp.schema.oaf; +import static com.google.common.base.Preconditions.checkArgument; + import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.google.common.base.Preconditions.checkArgument; - public class Relation extends Oaf { - private String relType; + private String relType; - private String 
subRelType; + private String subRelType; - private String relClass; + private String relClass; - private String source; + private String source; - private String target; + private String target; - private List collectedFrom = new ArrayList<>(); + private List collectedFrom = new ArrayList<>(); - public String getRelType() { - return relType; - } + public String getRelType() { + return relType; + } - public void setRelType(final String relType) { - this.relType = relType; - } + public void setRelType(final String relType) { + this.relType = relType; + } - public String getSubRelType() { - return subRelType; - } + public String getSubRelType() { + return subRelType; + } - public void setSubRelType(final String subRelType) { - this.subRelType = subRelType; - } + public void setSubRelType(final String subRelType) { + this.subRelType = subRelType; + } - public String getRelClass() { - return relClass; - } + public String getRelClass() { + return relClass; + } - public void setRelClass(final String relClass) { - this.relClass = relClass; - } + public void setRelClass(final String relClass) { + this.relClass = relClass; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(final String source) { - this.source = source; - } + public void setSource(final String source) { + this.source = source; + } - public String getTarget() { - return target; - } + public String getTarget() { + return target; + } - public void setTarget(final String target) { - this.target = target; - } + public void setTarget(final String target) { + this.target = target; + } - public List getCollectedFrom() { - return collectedFrom; - } + public List getCollectedFrom() { + return collectedFrom; + } - public void setCollectedFrom(final List collectedFrom) { - this.collectedFrom = collectedFrom; - } + public void setCollectedFrom(final List collectedFrom) { + this.collectedFrom = collectedFrom; + } - public void mergeFrom(final 
Relation r) { + public void mergeFrom(final Relation r) { - checkArgument(Objects.equals(getSource(), r.getSource()),"source ids must be equal"); - checkArgument(Objects.equals(getTarget(), r.getTarget()),"target ids must be equal"); - checkArgument(Objects.equals(getRelType(), r.getRelType()),"relType(s) must be equal"); - checkArgument(Objects.equals(getSubRelType(), r.getSubRelType()),"subRelType(s) must be equal"); - checkArgument(Objects.equals(getRelClass(), r.getRelClass()),"relClass(es) must be equal"); + checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); + checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); + checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); + checkArgument( + Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); + checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); - setCollectedFrom( - Stream - .concat(Optional.ofNullable(getCollectedFrom()).map(Collection::stream).orElse(Stream.empty()), - Optional.ofNullable(r.getCollectedFrom()).map(Collection::stream).orElse(Stream.empty())) - .distinct() // relies on KeyValue.equals - .collect(Collectors.toList())); - } + setCollectedFrom( + Stream.concat( + Optional.ofNullable(getCollectedFrom()) + .map(Collection::stream) + .orElse(Stream.empty()), + Optional.ofNullable(r.getCollectedFrom()) + .map(Collection::stream) + .orElse(Stream.empty())) + .distinct() // relies on KeyValue.equals + .collect(Collectors.toList())); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Relation relation = (Relation) o; - return relType.equals(relation.relType) && - subRelType.equals(relation.subRelType) && - relClass.equals(relation.relClass) && - source.equals(relation.source) && - target.equals(relation.target); - } - - @Override - public 
int hashCode() { - return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom); - } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Relation relation = (Relation) o; + return relType.equals(relation.relType) + && subRelType.equals(relation.subRelType) + && relClass.equals(relation.relClass) + && source.equals(relation.source) + && target.equals(relation.target); + } + @Override + public int hashCode() { + return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index f98bcec93..f9d9756d5 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -223,7 +223,7 @@ public class Result extends OafEntity implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Result.class.isAssignableFrom(e.getClass())){ + if (!Result.class.isAssignableFrom(e.getClass())) { return; } @@ -231,11 +231,9 @@ public class Result extends OafEntity implements Serializable { instance = mergeLists(instance, r.getInstance()); - if (r.getResulttype() != null && compareTrust(this, r) < 0) - resulttype = r.getResulttype(); + if (r.getResulttype() != null && compareTrust(this, r) < 0) resulttype = r.getResulttype(); - if (r.getLanguage() != null && compareTrust(this, r) < 0) - language = r.getLanguage(); + if (r.getLanguage() != null && compareTrust(this, r) < 0) language = r.getLanguage(); country = mergeLists(country, r.getCountry()); @@ -247,8 +245,7 @@ public class Result extends OafEntity implements Serializable { description = longestLists(description, r.getDescription()); - if (r.getPublisher() != null && compareTrust(this, r) < 0) - publisher = r.getPublisher(); + if (r.getPublisher() 
!= null && compareTrust(this, r) < 0) publisher = r.getPublisher(); if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) embargoenddate = r.getEmbargoenddate(); @@ -261,8 +258,7 @@ public class Result extends OafEntity implements Serializable { contributor = mergeLists(contributor, r.getContributor()); - if (r.getResourcetype() != null) - resourcetype = r.getResourcetype(); + if (r.getResourcetype() != null) resourcetype = r.getResourcetype(); coverage = mergeLists(coverage, r.getCoverage()); @@ -271,13 +267,21 @@ public class Result extends OafEntity implements Serializable { externalReference = mergeLists(externalReference, r.getExternalReference()); } - private List> longestLists(List> a, List> b) { - if (a == null || b == null) - return a == null ? b : a; + if (a == null || b == null) return a == null ? b : a; if (a.size() == b.size()) { - int msa = a.stream().filter(i -> i.getValue() != null).map(i -> i.getValue().length()).max(Comparator.naturalOrder()).orElse(0); - int msb = b.stream().filter(i -> i.getValue() != null).map(i -> i.getValue().length()).max(Comparator.naturalOrder()).orElse(0); + int msa = + a.stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); + int msb = + b.stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); return msa > msb ? a : b; } return a.size() > b.size() ? 
a : b; @@ -289,31 +293,53 @@ public class Result extends OafEntity implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Result result = (Result) o; - return Objects.equals(author, result.author) && - Objects.equals(resulttype, result.resulttype) && - Objects.equals(language, result.language) && - Objects.equals(country, result.country) && - Objects.equals(subject, result.subject) && - Objects.equals(title, result.title) && - Objects.equals(relevantdate, result.relevantdate) && - Objects.equals(description, result.description) && - Objects.equals(dateofacceptance, result.dateofacceptance) && - Objects.equals(publisher, result.publisher) && - Objects.equals(embargoenddate, result.embargoenddate) && - Objects.equals(source, result.source) && - Objects.equals(fulltext, result.fulltext) && - Objects.equals(format, result.format) && - Objects.equals(contributor, result.contributor) && - Objects.equals(resourcetype, result.resourcetype) && - Objects.equals(coverage, result.coverage) && - Objects.equals(bestaccessright, result.bestaccessright) && - Objects.equals(context, result.context) && - Objects.equals(externalReference, result.externalReference) && - Objects.equals(instance, result.instance); + return Objects.equals(author, result.author) + && Objects.equals(resulttype, result.resulttype) + && Objects.equals(language, result.language) + && Objects.equals(country, result.country) + && Objects.equals(subject, result.subject) + && Objects.equals(title, result.title) + && Objects.equals(relevantdate, result.relevantdate) + && Objects.equals(description, result.description) + && Objects.equals(dateofacceptance, result.dateofacceptance) + && Objects.equals(publisher, result.publisher) + && Objects.equals(embargoenddate, result.embargoenddate) + && Objects.equals(source, result.source) + && Objects.equals(fulltext, result.fulltext) + && Objects.equals(format, result.format) + && Objects.equals(contributor, 
result.contributor) + && Objects.equals(resourcetype, result.resourcetype) + && Objects.equals(coverage, result.coverage) + && Objects.equals(bestaccessright, result.bestaccessright) + && Objects.equals(context, result.context) + && Objects.equals(externalReference, result.externalReference) + && Objects.equals(instance, result.instance); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), author, resulttype, language, country, subject, title, relevantdate, description, dateofacceptance, publisher, embargoenddate, source, fulltext, format, contributor, resourcetype, coverage, bestaccessright, context, externalReference, instance); + return Objects.hash( + super.hashCode(), + author, + resulttype, + language, + country, + subject, + title, + relevantdate, + description, + dateofacceptance, + publisher, + embargoenddate, + source, + fulltext, + format, + contributor, + resourcetype, + coverage, + bestaccessright, + context, + externalReference, + instance); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java index 6b51a6ada..a7d36638e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java @@ -50,7 +50,7 @@ public class Software extends Result implements Serializable { public void mergeFrom(OafEntity e) { super.mergeFrom(e); - if (!Software.class.isAssignableFrom(e.getClass())){ + if (!Software.class.isAssignableFrom(e.getClass())) { return; } @@ -59,9 +59,15 @@ public class Software extends Result implements Serializable { license = mergeLists(license, s.getLicense()); - codeRepositoryUrl = s.getCodeRepositoryUrl()!= null && compareTrust(this, s)<0?s.getCodeRepositoryUrl():codeRepositoryUrl; + codeRepositoryUrl = + s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0 + ? 
s.getCodeRepositoryUrl() + : codeRepositoryUrl; - programmingLanguage= s.getProgrammingLanguage()!= null && compareTrust(this, s)<0?s.getProgrammingLanguage():programmingLanguage; + programmingLanguage = + s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 + ? s.getProgrammingLanguage() + : programmingLanguage; mergeOAFDataInfo(e); } @@ -72,14 +78,19 @@ public class Software extends Result implements Serializable { if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; Software software = (Software) o; - return Objects.equals(documentationUrl, software.documentationUrl) && - Objects.equals(license, software.license) && - Objects.equals(codeRepositoryUrl, software.codeRepositoryUrl) && - Objects.equals(programmingLanguage, software.programmingLanguage); + return Objects.equals(documentationUrl, software.documentationUrl) + && Objects.equals(license, software.license) + && Objects.equals(codeRepositoryUrl, software.codeRepositoryUrl) + && Objects.equals(programmingLanguage, software.programmingLanguage); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), documentationUrl, license, codeRepositoryUrl, programmingLanguage); + return Objects.hash( + super.hashCode(), + documentationUrl, + license, + codeRepositoryUrl, + programmingLanguage); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java index 5df6b80f3..a42071b18 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; -public class StructuredProperty implements Serializable { +public class StructuredProperty implements Serializable { private String value; @@ -34,8 +34,8 @@ public class StructuredProperty implements Serializable { 
this.dataInfo = dataInfo; } - public String toComparableString(){ - return value != null ? value.toLowerCase() : ""; + public String toComparableString() { + return value != null ? value.toLowerCase() : ""; } @Override @@ -45,16 +45,12 @@ public class StructuredProperty implements Serializable { @Override public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; StructuredProperty other = (StructuredProperty) obj; - return toComparableString() - .equals(other.toComparableString()); + return toComparableString().equals(other.toComparableString()); } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java index 10aafaa4c..b6c8f75a0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java @@ -2,12 +2,11 @@ package eu.dnetlib.dhp.schema.scholexplorer; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.OafEntity; -import org.apache.commons.lang3.StringUtils; - import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; public class DLIDataset extends Dataset { @@ -47,33 +46,45 @@ public class DLIDataset extends Dataset { DLIDataset p = (DLIDataset) e; if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; + if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; dlicollectedfrom = mergeProvenance(dlicollectedfrom, 
p.getDlicollectedfrom()); } - private List mergeProvenance(final List a, final List b) { + private List mergeProvenance( + final List a, final List b) { Map result = new HashMap<>(); if (a != null) - a.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + a.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); if (b != null) - b.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + b.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); return new ArrayList<>(result.values()); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java 
b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java index ebd56eaa9..6b2d440bc 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java @@ -2,9 +2,9 @@ package eu.dnetlib.dhp.schema.scholexplorer; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Publication; -import org.apache.commons.lang3.StringUtils; import java.io.Serializable; import java.util.*; +import org.apache.commons.lang3.StringUtils; public class DLIPublication extends Publication implements Serializable { @@ -44,33 +44,45 @@ public class DLIPublication extends Publication implements Serializable { DLIPublication p = (DLIPublication) e; if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; + if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); } - private List mergeProvenance(final List a, final List b) { + private List mergeProvenance( + final List a, final List b) { Map result = new HashMap<>(); if (a != null) - a.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + a.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - 
result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); if (b != null) - b.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + b.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); return new ArrayList<>(result.values()); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java index c7e6dda27..155a0f608 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java @@ -1,15 +1,13 @@ package eu.dnetlib.dhp.schema.scholexplorer; import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; public class DLIUnknown extends Oaf implements Serializable { @@ -49,7 +47,6 @@ public class DLIUnknown extends Oaf implements Serializable { this.id = id; } - public List getPid() { return pid; } @@ -75,33 
+72,45 @@ public class DLIUnknown extends Oaf implements Serializable { } public void mergeFrom(DLIUnknown p) { - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; + if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); } - private List mergeProvenance(final List a, final List b) { + private List mergeProvenance( + final List a, final List b) { Map result = new HashMap<>(); if (a != null) - a.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + a.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); if (b != null) - b.forEach(p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + b.forEach( + p -> { + if (p != null + && StringUtils.isNotBlank(p.getId()) + && result.containsKey(p.getId())) { + if ("incomplete" + .equalsIgnoreCase( + result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - 
result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); return new ArrayList<>(result.values()); } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java index 3fe069b03..73384378c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java @@ -10,7 +10,7 @@ public class ProvenaceInfo implements Serializable { private String completionStatus; - private String collectionMode ="collected"; + private String collectionMode = "collected"; public String getId() { return id; diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java index d216c05d5..30fbe551c 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java @@ -1,18 +1,14 @@ package eu.dnetlib.dhp.schema.action; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; -import org.apache.commons.lang3.StringUtils; - -import org.junit.jupiter.api.Test; - -import java.io.IOException; - import static org.junit.jupiter.api.Assertions.*; -/** - * @author claudio.atzori - */ +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Relation; +import java.io.IOException; +import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.Test; + +/** @author claudio.atzori */ public class AtomicActionTest { @Test @@ -36,7 +32,5 @@ public class AtomicActionTest { assertEquals(aa1.getClazz(), aa2.getClazz()); assertEquals(aa1.getPayload(), aa2.getPayload()); - } - } diff --git 
a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java index 21583cd44..65b65ebeb 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java @@ -1,14 +1,14 @@ package eu.dnetlib.dhp.schema.common; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - public class ModelSupportTest { @Nested @@ -32,4 +32,4 @@ public class ModelSupportTest { assertTrue(result); } } -} \ No newline at end of file +} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index ac4bd5d27..58ff670bc 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -1,11 +1,11 @@ package eu.dnetlib.dhp.schema.oaf; import static org.junit.jupiter.api.Assertions.*; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.List; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; public class MergeTest { @@ -19,7 +19,7 @@ public class MergeTest { @Test public void mergeListsTest() { - //string list merge test + // string list merge test List a = Arrays.asList("a", "b", "c", "e"); List b = Arrays.asList("a", "b", "c", "d"); List c = null; @@ -44,7 +44,6 @@ public class MergeTest { assertNotNull(a.getCollectedfrom()); 
assertEquals(3, a.getCollectedfrom().size()); - } @Test @@ -60,7 +59,6 @@ public class MergeTest { assertNotNull(a.getSubject()); assertEquals(3, a.getSubject().size()); - } private KeyValue setKV(final String key, final String value) { @@ -73,7 +71,8 @@ public class MergeTest { return k; } - private StructuredProperty setSP(final String value, final String schema, final String classname) { + private StructuredProperty setSP( + final String value, final String schema, final String classname) { StructuredProperty s = new StructuredProperty(); s.setValue(value); Qualifier q = new Qualifier(); diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java index 6a88151c9..63d3135db 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java @@ -6,48 +6,44 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import org.junit.jupiter.api.Test; - import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import org.junit.jupiter.api.Test; public class DLItest { - @Test public void testMergePublication() throws JsonProcessingException { DLIPublication a1 = new DLIPublication(); - a1.setPid(Arrays.asList( createSP("123456","pdb","dnet:pid_types"))); + a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); - a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd","Zenodo","complete"))); + a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); a1.setCompletionStatus("complete"); DLIPublication a = new DLIPublication(); - 
a.setPid(Arrays.asList(createSP("10.11","doi","dnet:pid_types"), createSP("123456","pdb","dnet:pid_types"))); + a.setPid( + Arrays.asList( + createSP("10.11", "doi", "dnet:pid_types"), + createSP("123456", "pdb", "dnet:pid_types"))); a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); - a.setDlicollectedfrom(Arrays.asList(createCollectedFrom("dct","datacite","complete"),createCollectedFrom("dct","datacite","incomplete"))); + a.setDlicollectedfrom( + Arrays.asList( + createCollectedFrom("dct", "datacite", "complete"), + createCollectedFrom("dct", "datacite", "incomplete"))); a.setCompletionStatus("incomplete"); a.mergeFrom(a1); ObjectMapper mapper = new ObjectMapper(); System.out.println(mapper.writeValueAsString(a)); - - - - - - - } - - @Test public void testDeserialization() throws IOException { - final String json ="{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. 
Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. 
Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; + final String json = + "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. 
Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. 
Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); @@ -56,7 +52,8 @@ public class DLItest { System.out.println(mapper.writeValueAsString(dliDataset)); } - private ProvenaceInfo createCollectedFrom(final String id, final String name, final String completionStatus) { + private ProvenaceInfo createCollectedFrom( + final String id, final String name, final String completionStatus) { ProvenaceInfo p = new ProvenaceInfo(); p.setId(id); p.setName(name); @@ -64,8 +61,8 @@ public class DLItest { return p; } - - private StructuredProperty createSP(final String value, final String className, final String schemeName) { + private StructuredProperty createSP( + final String value, final String className, final String schemeName) { StructuredProperty p = new StructuredProperty(); p.setValue(value); Qualifier schema = new Qualifier(); @@ -76,6 +73,4 @@ public class DLItest { p.setQualifier(schema); return p; } - - } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 994108b1c..f3a876388 100644 --- 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -10,117 +10,130 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import org.dom4j.Document; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; import java.util.stream.Collectors; +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ISClient implements Serializable { - private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger log = + LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; + private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; - private ISLookUpService isLookup; + private ISLookUpService isLookup; - public ISClient(String isLookupUrl) { - isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); - } + public ISClient(String isLookupUrl) { + isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + } - public List getLatestRawsetPaths(String setIds) { + public List getLatestRawsetPaths(String setIds) { - List ids = Lists.newArrayList(Splitter.on(INPUT_ACTION_SET_ID_SEPARATOR) - .omitEmptyStrings() - .trimResults() - .split(setIds)); + List ids = + Lists.newArrayList( + Splitter.on(INPUT_ACTION_SET_ID_SEPARATOR) + .omitEmptyStrings() + .trimResults() + .split(setIds)); - 
return ids.stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } + return ids.stream() + .map(id -> getSet(isLookup, id)) + .map(as -> as.getPathToLatest()) + .collect(Collectors.toCollection(ArrayList::new)); + } - private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { + private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " - + "where $x//SET/@id = '" + setId + "' return $x"; + final String q = + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + "where $x//SET/@id = '" + + setId + + "' return $x"; - try { - final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); - } - } + try { + final String basePath = getBasePathHDFS(isLookup); + final String setProfile = isLookup.getResourceProfileByQuery(q); + return getActionManagerSet(basePath, setProfile); + } catch (ISLookUpException | ActionManagerException e) { + throw new RuntimeException("Error accessing Sets, using query: " + q); + } + } - private ActionManagerSet getActionManagerSet(final String basePath, final String profile) throws ActionManagerException { - final SAXReader reader = new SAXReader(); - final ActionManagerSet set = new ActionManagerSet(); + private ActionManagerSet getActionManagerSet(final String basePath, final String profile) + throws ActionManagerException { + final SAXReader reader = new SAXReader(); + final ActionManagerSet set = new ActionManagerSet(); - try { - final Document doc = reader.read(new StringReader(profile)); + try { + final Document doc = 
reader.read(new StringReader(profile)); - set.setId(doc.valueOf("//SET/@id").trim()); - set.setName(doc.valueOf("//SET").trim()); - set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); - set.setLatest(doc.valueOf("//RAW_SETS/LATEST/@id"), doc.valueOf("//RAW_SETS/LATEST/@creationDate"), doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); - set.setDirectory(doc.valueOf("//SET/@directory")); - final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); - if (expiredNodes != null) { - for (int i = 0; i < expiredNodes.size(); i++) { - Element ex = (Element) expiredNodes.get(i); - set.addExpired(ex.attributeValue("id"), ex.attributeValue("creationDate"), ex.attributeValue("lastUpdate")); - } - } + set.setId(doc.valueOf("//SET/@id").trim()); + set.setName(doc.valueOf("//SET").trim()); + set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); + set.setLatest( + doc.valueOf("//RAW_SETS/LATEST/@id"), + doc.valueOf("//RAW_SETS/LATEST/@creationDate"), + doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); + set.setDirectory(doc.valueOf("//SET/@directory")); + final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); + if (expiredNodes != null) { + for (int i = 0; i < expiredNodes.size(); i++) { + Element ex = (Element) expiredNodes.get(i); + set.addExpired( + ex.attributeValue("id"), + ex.attributeValue("creationDate"), + ex.attributeValue("lastUpdate")); + } + } - final StringBuilder sb = new StringBuilder(); - sb.append(basePath); - sb.append("/"); - sb.append(doc.valueOf("//SET/@directory")); - sb.append("/"); - sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); - set.setPathToLatest(sb.toString()); + final StringBuilder sb = new StringBuilder(); + sb.append(basePath); + sb.append("/"); + sb.append(doc.valueOf("//SET/@directory")); + sb.append("/"); + sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); + set.setPathToLatest(sb.toString()); - return set; - } catch (Exception e) { - throw new ActionManagerException("Error creating set from profile: " + 
profile, e); - } - } - - private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { - return queryServiceProperty(isLookup, "basePath"); - } - - private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) throws ActionManagerException { - final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" - + propertyName + "']/@value/string()"; - log.debug("quering for service property: " + q); - try { - final List value = isLookup.quickSearchProfile(q); - return Iterables.getOnlyElement(value); - } catch (ISLookUpException e) { - String msg = "Error accessing service profile, using query: " + q; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (NoSuchElementException e) { - String msg = "missing service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (IllegalArgumentException e) { - String msg = "found more than one service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } - } + return set; + } catch (Exception e) { + throw new ActionManagerException("Error creating set from profile: " + profile, e); + } + } + private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { + return queryServiceProperty(isLookup, "basePath"); + } + private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) + throws ActionManagerException { + final String q = + "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" + + propertyName + + "']/@value/string()"; + log.debug("quering for service property: " + q); + try { + final List value = isLookup.quickSearchProfile(q); + return Iterables.getOnlyElement(value); + } catch (ISLookUpException e) { + String msg = "Error accessing 
service profile, using query: " + q; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (NoSuchElementException e) { + String msg = "missing service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (IllegalArgumentException e) { + String msg = "found more than one service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java index 7c4d0616c..9ead5411f 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java @@ -1,9 +1,15 @@ package eu.dnetlib.dhp.actionmanager.partition; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static org.apache.spark.sql.functions.*; + import eu.dnetlib.dhp.actionmanager.ISClient; -import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -14,32 +20,27 @@ import org.apache.spark.sql.types.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static 
org.apache.spark.sql.functions.*; - -/** - * Partitions given set of action sets by payload type. - */ +/** Partitions given set of action sets by payload type. */ public class PartitionActionSetsByPayloadTypeJob { - private static final Logger logger = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger logger = + LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final StructType KV_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()) - )); + private static final StructType KV_SCHEMA = + StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply( + "key", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply( + "value", DataTypes.StringType, false, Metadata.empty()))); - private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("payload", DataTypes.StringType, false, Metadata.empty()) - )); + private static final StructType ATOMIC_ACTION_SCHEMA = + StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply( + "clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); private ISClient isClient; @@ -47,20 +48,20 @@ public class PartitionActionSetsByPayloadTypeJob { this.isClient = new ISClient(isLookupUrl); } - public PartitionActionSetsByPayloadTypeJob() { - } + public PartitionActionSetsByPayloadTypeJob() {} public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils.toString( - PromoteActionPayloadForGraphTableJob.class - 
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); + String jsonConfiguration = + IOUtils.toString( + PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputActionSetIds = parser.get("inputActionSetIds"); @@ -72,7 +73,8 @@ public class PartitionActionSetsByPayloadTypeJob { String isLookupUrl = parser.get("isLookupUrl"); logger.info("isLookupUrl: {}", isLookupUrl); - new PartitionActionSetsByPayloadTypeJob(isLookupUrl).run(isSparkSessionManaged, inputActionSetIds, outputPath); + new PartitionActionSetsByPayloadTypeJob(isLookupUrl) + .run(isSparkSessionManaged, inputActionSetIds, outputPath); } protected void run(Boolean isSparkSessionManaged, String inputActionSetIds, String outputPath) { @@ -83,53 +85,51 @@ public class PartitionActionSetsByPayloadTypeJob { SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); readAndWriteActionSetsFromPaths(spark, inputActionSetPaths, outputPath); }); } - private static void removeOutputDir(SparkSession spark, - String path) { + private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, 
spark.sparkContext().hadoopConfiguration()); } - private static void readAndWriteActionSetsFromPaths(SparkSession spark, - List inputActionSetPaths, - String outputPath) { - inputActionSetPaths - .stream() - .filter(path -> HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) - .forEach(inputActionSetPath -> { - Dataset actionDS = readActionSetFromPath(spark, inputActionSetPath); - saveActions(actionDS, outputPath); - }); + private static void readAndWriteActionSetsFromPaths( + SparkSession spark, List inputActionSetPaths, String outputPath) { + inputActionSetPaths.stream() + .filter( + path -> + HdfsSupport.exists( + path, spark.sparkContext().hadoopConfiguration())) + .forEach( + inputActionSetPath -> { + Dataset actionDS = + readActionSetFromPath(spark, inputActionSetPath); + saveActions(actionDS, outputPath); + }); } - private static Dataset readActionSetFromPath(SparkSession spark, - String path) { + private static Dataset readActionSetFromPath(SparkSession spark, String path) { logger.info("Reading actions from path: {}", path); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD rdd = sc - .sequenceFile(path, Text.class, Text.class) - .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); + JavaRDD rdd = + sc.sequenceFile(path, Text.class, Text.class) + .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); return spark.createDataFrame(rdd, KV_SCHEMA) .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) .select(expr("atomic_action.*")); } - private static void saveActions(Dataset actionDS, - String path) { + private static void saveActions(Dataset actionDS, String path) { logger.info("Saving actions to path: {}", path); - actionDS - .write() - .partitionBy("clazz") - .mode(SaveMode.Append) - .parquet(path); + actionDS.write().partitionBy("clazz").mode(SaveMode.Append).parquet(path); } public ISClient getIsClient() { diff --git 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java index 19b2104bc..eb839245b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java @@ -1,41 +1,39 @@ package eu.dnetlib.dhp.actionmanager.promote; +import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Relation; - import java.util.function.BiFunction; -import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; - -/** - * OAF model merging support. - */ +/** OAF model merging support. */ public class MergeAndGet { - private MergeAndGet() { - } + private MergeAndGet() {} /** * Strategy for merging OAF model objects. - *

- * MERGE_FROM_AND_GET: use OAF 'mergeFrom' method - * SELECT_NEWER_AND_GET: use last update timestamp to return newer instance + * + *

MERGE_FROM_AND_GET: use OAF 'mergeFrom' method SELECT_NEWER_AND_GET: use last update + * timestamp to return newer instance */ public enum Strategy { - MERGE_FROM_AND_GET, SELECT_NEWER_AND_GET + MERGE_FROM_AND_GET, + SELECT_NEWER_AND_GET } /** * Returns a function for merging OAF model objects. * * @param strategy Strategy to be used to merge objects - * @param Graph table type - * @param Action payload type + * @param Graph table type + * @param Action payload type * @return BiFunction to be used to merge OAF objects */ - public static SerializableSupplier> functionFor(Strategy strategy) { + public static + SerializableSupplier> functionFor(Strategy strategy) { switch (strategy) { case MERGE_FROM_AND_GET: return () -> MergeAndGet::mergeFromAndGet; @@ -49,26 +47,36 @@ public class MergeAndGet { if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) { ((Relation) x).mergeFrom((Relation) y); return x; - } else if (isSubClass(x, OafEntity.class) && isSubClass(y, OafEntity.class) && isSubClass(x, y)) { + } else if (isSubClass(x, OafEntity.class) + && isSubClass(y, OafEntity.class) + && isSubClass(x, y)) { ((OafEntity) x).mergeFrom((OafEntity) y); return x; } - throw new RuntimeException(String.format("MERGE_FROM_AND_GET incompatible types: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + throw new RuntimeException( + String.format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); } private static G selectNewerAndGet(G x, A y) { - if (x.getClass().equals(y.getClass()) && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { + if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { return x; - } else if (x.getClass().equals(y.getClass()) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { + } else if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) 
{ return (G) y; } else if (isSubClass(x, y) && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { return x; } else if (isSubClass(x, y) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { - throw new RuntimeException(String.format("SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + throw new RuntimeException( + String.format( + "SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); } - throw new RuntimeException(String.format("SELECT_NEWER_AND_GET cannot be used when left is not subtype of right: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + throw new RuntimeException( + String.format( + "SELECT_NEWER_AND_GET cannot be used when left is not subtype of right: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 20b75842c..1fa173e9c 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -1,14 +1,20 @@ package eu.dnetlib.dhp.actionmanager.promote; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; + import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import 
eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import java.util.Objects; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.function.Function; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -17,33 +23,25 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Objects; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.function.Function; - -import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -/** - * Applies a given action payload file to graph table of compatible type. - */ +/** Applies a given action payload file to graph table of compatible type. 
*/ public class PromoteActionPayloadForGraphTableJob { - private static final Logger logger = LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); + private static final Logger logger = + LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils.toString( - PromoteActionPayloadForGraphTableJob.class - .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); + String jsonConfiguration = + IOUtils.toString( + PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputGraphTablePath = parser.get("inputGraphTablePath"); @@ -61,11 +59,13 @@ public class PromoteActionPayloadForGraphTableJob { String outputGraphTablePath = parser.get("outputGraphTablePath"); logger.info("outputGraphTablePath: {}", outputGraphTablePath); - MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); + MergeAndGet.Strategy strategy = + MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); logger.info("strategy: {}", strategy); Class rowClazz = (Class) Class.forName(graphTableClassName); - Class actionPayloadClazz = (Class) Class.forName(actionPayloadClassName); + Class 
actionPayloadClazz = + (Class) Class.forName(actionPayloadClassName); throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); @@ -73,10 +73,13 @@ public class PromoteActionPayloadForGraphTableJob { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputGraphTablePath); - promoteActionPayloadForGraphTable(spark, + promoteActionPayloadForGraphTable( + spark, inputGraphTablePath, inputActionPayloadPath, outputGraphTablePath, @@ -86,44 +89,50 @@ public class PromoteActionPayloadForGraphTableJob { }); } - private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(Class rowClazz, - Class actionPayloadClazz) { + private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass( + Class rowClazz, Class actionPayloadClazz) { if (!isSubClass(rowClazz, actionPayloadClazz)) { - String msg = String.format("graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + String msg = + String.format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); throw new RuntimeException(msg); } } - private static void removeOutputDir(SparkSession spark, - String path) { + private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - private static void promoteActionPayloadForGraphTable(SparkSession spark, - String inputGraphTablePath, - String inputActionPayloadPath, - String outputGraphTablePath, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { + private static void promoteActionPayloadForGraphTable( 
+ SparkSession spark, + String inputGraphTablePath, + String inputActionPayloadPath, + String outputGraphTablePath, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); - Dataset actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); + Dataset actionPayloadDS = + readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); - Dataset result = promoteActionPayloadForGraphTable(rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) - .map((MapFunction) value -> value, Encoders.bean(rowClazz)); + Dataset result = + promoteActionPayloadForGraphTable( + rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) + .map((MapFunction) value -> value, Encoders.bean(rowClazz)); saveGraphTable(result, outputGraphTablePath); } - private static Dataset readGraphTable(SparkSession spark, - String path, - Class rowClazz) { + private static Dataset readGraphTable( + SparkSession spark, String path, Class rowClazz) { logger.info("Reading graph table from path: {}", path); return spark.read() .textFile(path) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), Encoders.bean(rowClazz)); + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), + Encoders.bean(rowClazz)); /* return spark @@ -133,33 +142,44 @@ public class PromoteActionPayloadForGraphTableJob { */ } - private static Dataset readActionPayload(SparkSession spark, - String path, - Class actionPayloadClazz) { + private static Dataset readActionPayload( + SparkSession spark, String path, Class actionPayloadClazz) { logger.info("Reading action payload from path: {}", path); - return spark - .read() + return spark.read() .parquet(path) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value.getAs("payload"), - actionPayloadClazz), Encoders.bean(actionPayloadClazz)); + .map( + (MapFunction) + value -> + OBJECT_MAPPER.readValue( + 
value.getAs("payload"), actionPayloadClazz), + Encoders.bean(actionPayloadClazz)); } - private static Dataset promoteActionPayloadForGraphTable(Dataset rowDS, - Dataset actionPayloadDS, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - logger.info("Promoting action payload for graph table: payload={}, table={}", actionPayloadClazz.getSimpleName(), rowClazz.getSimpleName()); + private static Dataset promoteActionPayloadForGraphTable( + Dataset rowDS, + Dataset actionPayloadDS, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + logger.info( + "Promoting action payload for graph table: payload={}, table={}", + actionPayloadClazz.getSimpleName(), + rowClazz.getSimpleName()); - SerializableSupplier> rowIdFn = PromoteActionPayloadForGraphTableJob::idFn; - SerializableSupplier> actionPayloadIdFn = PromoteActionPayloadForGraphTableJob::idFn; - SerializableSupplier> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); - SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); + SerializableSupplier> rowIdFn = + PromoteActionPayloadForGraphTableJob::idFn; + SerializableSupplier> actionPayloadIdFn = + PromoteActionPayloadForGraphTableJob::idFn; + SerializableSupplier> mergeRowWithActionPayloadAndGetFn = + MergeAndGet.functionFor(strategy); + SerializableSupplier> mergeRowsAndGetFn = + MergeAndGet.functionFor(strategy); SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> isNotZeroFn = + PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; - Dataset joinedAndMerged = PromoteActionPayloadFunctions - .joinGraphTableWithActionPayloadAndMerge( + Dataset joinedAndMerged = + PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( rowDS, actionPayloadDS, rowIdFn, @@ -168,14 +188,8 @@ public class PromoteActionPayloadForGraphTableJob { 
rowClazz, actionPayloadClazz); - return PromoteActionPayloadFunctions - .groupGraphTableByIdAndMerge( - joinedAndMerged, - rowIdFn, - mergeRowsAndGetFn, - zeroFn, - isNotZeroFn, - rowClazz); + return PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( + joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); } private static Function idFn() { @@ -190,19 +204,49 @@ public class PromoteActionPayloadForGraphTableJob { private static String idFnForRelation(T t) { Relation r = (Relation) t; return Optional.ofNullable(r.getSource()) - .map(source -> Optional.ofNullable(r.getTarget()) - .map(target -> Optional.ofNullable(r.getRelType()) - .map(relType -> Optional.ofNullable(r.getSubRelType()) - .map(subRelType -> Optional.ofNullable(r.getRelClass()) - .map(relClass -> String.join(source, target, relType, subRelType, relClass)) - .orElse(String.join(source, target, relType, subRelType)) - ) - .orElse(String.join(source, target, relType)) - ) - .orElse(String.join(source, target)) - ) - .orElse(source) - ) + .map( + source -> + Optional.ofNullable(r.getTarget()) + .map( + target -> + Optional.ofNullable(r.getRelType()) + .map( + relType -> + Optional.ofNullable( + r + .getSubRelType()) + .map( + subRelType -> + Optional + .ofNullable( + r + .getRelClass()) + .map( + relClass -> + String + .join( + source, + target, + relType, + subRelType, + relClass)) + .orElse( + String + .join( + source, + target, + relType, + subRelType))) + .orElse( + String + .join( + source, + target, + relType))) + .orElse( + String.join( + source, target))) + .orElse(source)) .orElse(null); } @@ -242,13 +286,8 @@ public class PromoteActionPayloadForGraphTableJob { }; } - private static void saveGraphTable(Dataset result, - String path) { + private static void saveGraphTable(Dataset result, String path) { logger.info("Saving graph table to path: {}", path); - result - .toJSON() - .write() - .option("compression", "gzip") - .text(path); + 
result.toJSON().write().option("compression", "gzip").text(path); } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index fda86cb19..c78374029 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -1,7 +1,13 @@ package eu.dnetlib.dhp.actionmanager.promote; +import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import eu.dnetlib.dhp.schema.oaf.Oaf; +import java.util.Objects; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.function.Function; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -11,95 +17,113 @@ import org.apache.spark.sql.TypedColumn; import org.apache.spark.sql.expressions.Aggregator; import scala.Tuple2; -import java.util.Objects; -import java.util.Optional; -import java.util.function.BiFunction; -import java.util.function.Function; - -import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; - -/** - * Promote action payload functions. - */ +/** Promote action payload functions. */ public class PromoteActionPayloadFunctions { - private PromoteActionPayloadFunctions() { - } + private PromoteActionPayloadFunctions() {} /** - * Joins dataset representing graph table with dataset representing action payload using supplied functions. + * Joins dataset representing graph table with dataset representing action payload using + * supplied functions. 
* - * @param rowDS Dataset representing graph table - * @param actionPayloadDS Dataset representing action payload - * @param rowIdFn Function used to get the id of graph table row - * @param actionPayloadIdFn Function used to get id of action payload instance - * @param mergeAndGetFn Function used to merge graph table row and action payload instance - * @param rowClazz Class of graph table + * @param rowDS Dataset representing graph table + * @param actionPayloadDS Dataset representing action payload + * @param rowIdFn Function used to get the id of graph table row + * @param actionPayloadIdFn Function used to get id of action payload instance + * @param mergeAndGetFn Function used to merge graph table row and action payload instance + * @param rowClazz Class of graph table * @param actionPayloadClazz Class of action payload - * @param Type of graph table row - * @param Type of action payload instance + * @param Type of graph table row + * @param Type of action payload instance * @return Dataset of merged graph table rows and action payload instances */ - public static Dataset joinGraphTableWithActionPayloadAndMerge(Dataset rowDS, - Dataset actionPayloadDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> actionPayloadIdFn, - SerializableSupplier> mergeAndGetFn, - Class rowClazz, - Class actionPayloadClazz) { + public static Dataset joinGraphTableWithActionPayloadAndMerge( + Dataset rowDS, + Dataset actionPayloadDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> actionPayloadIdFn, + SerializableSupplier> mergeAndGetFn, + Class rowClazz, + Class actionPayloadClazz) { if (!isSubClass(rowClazz, actionPayloadClazz)) { - throw new RuntimeException("action payload type must be the same or be a super type of table row type"); + throw new RuntimeException( + "action payload type must be the same or be a super type of table row type"); } Dataset> rowWithIdDS = mapToTupleWithId(rowDS, rowIdFn, rowClazz); - Dataset> actionPayloadWithIdDS = 
mapToTupleWithId(actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); + Dataset> actionPayloadWithIdDS = + mapToTupleWithId(actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); return rowWithIdDS - .joinWith(actionPayloadWithIdDS, rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), "full_outer") - .map((MapFunction, Tuple2>, G>) value -> { - Optional rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2); - Optional actionPayloadOpt = Optional.ofNullable(value._2()).map(Tuple2::_2); - return rowOpt - .map(row -> actionPayloadOpt - .map(actionPayload -> mergeAndGetFn.get().apply(row, actionPayload)) - .orElse(row)) - .orElseGet(() -> actionPayloadOpt - .filter(actionPayload -> actionPayload.getClass().equals(rowClazz)) - .map(rowClazz::cast) - .orElse(null)); - }, Encoders.kryo(rowClazz)) + .joinWith( + actionPayloadWithIdDS, + rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), + "full_outer") + .map( + (MapFunction, Tuple2>, G>) + value -> { + Optional rowOpt = + Optional.ofNullable(value._1()).map(Tuple2::_2); + Optional actionPayloadOpt = + Optional.ofNullable(value._2()).map(Tuple2::_2); + return rowOpt.map( + row -> + actionPayloadOpt + .map( + actionPayload -> + mergeAndGetFn + .get() + .apply( + row, + actionPayload)) + .orElse(row)) + .orElseGet( + () -> + actionPayloadOpt + .filter( + actionPayload -> + actionPayload + .getClass() + .equals( + rowClazz)) + .map(rowClazz::cast) + .orElse(null)); + }, + Encoders.kryo(rowClazz)) .filter((FilterFunction) Objects::nonNull); } - private static Dataset> mapToTupleWithId(Dataset ds, - SerializableSupplier> idFn, - Class clazz) { - return ds - .map((MapFunction>) value -> new Tuple2<>(idFn.get().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + private static Dataset> mapToTupleWithId( + Dataset ds, SerializableSupplier> idFn, Class clazz) { + return ds.map( + (MapFunction>) + value -> new Tuple2<>(idFn.get().apply(value), value), + 
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); } /** * Groups graph table by id and aggregates using supplied functions. * - * @param rowDS Dataset representing graph table - * @param rowIdFn Function used to get the id of graph table row + * @param rowDS Dataset representing graph table + * @param rowIdFn Function used to get the id of graph table row * @param mergeAndGetFn Function used to merge graph table rows - * @param zeroFn Function to create a zero/empty instance of graph table row - * @param isNotZeroFn Function to check if graph table row is not zero/empty - * @param rowClazz Class of graph table - * @param Type of graph table row + * @param zeroFn Function to create a zero/empty instance of graph table row + * @param isNotZeroFn Function to check if graph table row is not zero/empty + * @param rowClazz Class of graph table + * @param Type of graph table row * @return Dataset of aggregated graph table rows */ - public static Dataset groupGraphTableByIdAndMerge(Dataset rowDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier zeroFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { - TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); - return rowDS - .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) + public static Dataset groupGraphTableByIdAndMerge( + Dataset rowDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier zeroFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { + TypedColumn aggregator = + new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); + return rowDS.groupByKey( + (MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) .agg(aggregator) .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); } @@ -115,10 +139,11 @@ public class PromoteActionPayloadFunctions { private SerializableSupplier> isNotZeroFn; private 
Class rowClazz; - public TableAggregator(SerializableSupplier zeroFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { + public TableAggregator( + SerializableSupplier zeroFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { this.zeroFn = zeroFn; this.mergeAndGetFn = mergeAndGetFn; this.isNotZeroFn = isNotZeroFn; @@ -149,7 +174,8 @@ public class PromoteActionPayloadFunctions { } else if (!isNotZero.apply(left) && isNotZero.apply(right)) { return right; } - throw new RuntimeException("internal aggregation error: left and right objects are zero"); + throw new RuntimeException( + "internal aggregation error: left and right objects are zero"); } @Override @@ -167,4 +193,4 @@ public class PromoteActionPayloadFunctions { return Encoders.kryo(rowClazz); } } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java index bd5dc9a5d..7f605d92d 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java @@ -1,10 +1,20 @@ package eu.dnetlib.dhp.actionmanager.partition; +import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; +import static org.apache.spark.sql.functions.*; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static scala.collection.JavaConversions.mutableSeqAsJavaList; + import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; import eu.dnetlib.dhp.actionmanager.ISClient; import 
eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest; import eu.dnetlib.dhp.schema.oaf.*; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; @@ -25,32 +35,23 @@ import org.mockito.junit.jupiter.MockitoExtension; import scala.Tuple2; import scala.collection.mutable.Seq; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; -import static org.apache.spark.sql.functions.*; -import static org.junit.jupiter.api.Assertions.assertIterableEquals; -import static scala.collection.JavaConversions.mutableSeqAsJavaList; - @ExtendWith(MockitoExtension.class) public class PartitionActionSetsByPayloadTypeJobTest { - private static final ClassLoader cl = PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); + private static final ClassLoader cl = + PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); private static Configuration configuration; private static SparkSession spark; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("payload", DataTypes.StringType, false, Metadata.empty()) - )); + private static final StructType ATOMIC_ACTION_SCHEMA = + StructType$.MODULE$.apply( + Arrays.asList( + StructField$.MODULE$.apply( + "clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); 
@BeforeAll public static void beforeAll() throws IOException { @@ -71,11 +72,11 @@ public class PartitionActionSetsByPayloadTypeJobTest { @Nested class Main { - @Mock - private ISClient isClient; + @Mock private ISClient isClient; @Test - public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { + public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) + throws Exception { // given Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); Path outputDir = workingDir.resolve("output"); @@ -85,15 +86,16 @@ public class PartitionActionSetsByPayloadTypeJobTest { List inputActionSetsPaths = resolveInputActionSetPaths(inputActionSetsBaseDir); // when - Mockito.when(isClient.getLatestRawsetPaths(Mockito.anyString())).thenReturn(inputActionSetsPaths); + Mockito.when(isClient.getLatestRawsetPaths(Mockito.anyString())) + .thenReturn(inputActionSetsPaths); PartitionActionSetsByPayloadTypeJob job = new PartitionActionSetsByPayloadTypeJob(); job.setIsClient(isClient); job.run( Boolean.FALSE, - "", // it can be empty we're mocking the response from isClient to resolve the paths - outputDir.toString() - ); + "", // it can be empty we're mocking the response from isClient to resolve the + // paths + outputDir.toString()); // then Files.exists(outputDir); @@ -110,112 +112,134 @@ public class PartitionActionSetsByPayloadTypeJobTest { } } - private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) throws IOException { + private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) + throws IOException { Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); - return Files - .list(inputActionSetJsonDumpsDir) - .map(path -> { - String inputActionSetId = path.getFileName().toString(); - return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); - }) + return Files.list(inputActionSetJsonDumpsDir) + .map( + path -> { + String inputActionSetId = 
path.getFileName().toString(); + return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); + }) .collect(Collectors.toCollection(ArrayList::new)); } - private static Map> createActionSets(Path inputActionSetsDir) throws IOException { + private static Map> createActionSets(Path inputActionSetsDir) + throws IOException { Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); Map> oafsByType = new HashMap<>(); - Files - .list(inputActionSetJsonDumpsDir) - .forEach(inputActionSetJsonDumpFile -> { - String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString(); - Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); + Files.list(inputActionSetJsonDumpsDir) + .forEach( + inputActionSetJsonDumpFile -> { + String inputActionSetId = + inputActionSetJsonDumpFile.getFileName().toString(); + Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); - Dataset actionDS = readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()) - .cache(); + Dataset actionDS = + readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()) + .cache(); - writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString()); + writeActionsAsJobInput( + actionDS, inputActionSetId, inputActionSetDir.toString()); - Map> actionSetOafsByType = actionDS - .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) - .select(expr("atomic_action.*")) - .groupBy(col("clazz")) - .agg(collect_list(col("payload")).as("payload_list")) - .collectAsList() - .stream() - .map(row -> new AbstractMap.SimpleEntry<>(row.getAs("clazz"), - mutableSeqAsJavaList(row.>getAs("payload_list")))) - .collect(Collectors.toMap(AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue)); + Map> actionSetOafsByType = + actionDS + .withColumn( + "atomic_action", + from_json(col("value"), ATOMIC_ACTION_SCHEMA)) + .select(expr("atomic_action.*")).groupBy(col("clazz")) + .agg(collect_list(col("payload")).as("payload_list")) + 
.collectAsList().stream() + .map( + row -> + new AbstractMap.SimpleEntry<>( + row.getAs("clazz"), + mutableSeqAsJavaList( + row.>getAs( + "payload_list")))) + .collect( + Collectors.toMap( + AbstractMap.SimpleEntry::getKey, + AbstractMap.SimpleEntry::getValue)); - actionSetOafsByType.keySet() - .forEach(x -> { - if (oafsByType.containsKey(x)) { - List collected = new ArrayList<>(); - collected.addAll(oafsByType.get(x)); - collected.addAll(actionSetOafsByType.get(x)); - oafsByType.put(x, collected); - } else { - oafsByType.put(x, actionSetOafsByType.get(x)); - } - }); - }); + actionSetOafsByType + .keySet() + .forEach( + x -> { + if (oafsByType.containsKey(x)) { + List collected = new ArrayList<>(); + collected.addAll(oafsByType.get(x)); + collected.addAll(actionSetOafsByType.get(x)); + oafsByType.put(x, collected); + } else { + oafsByType.put(x, actionSetOafsByType.get(x)); + } + }); + }); return oafsByType; } private static Path getInputActionSetJsonDumpsDir() { - return Paths - .get(Objects.requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) + return Paths.get( + Objects.requireNonNull( + cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) .getFile()); } private static Dataset readActionsFromJsonDump(String path) { - return spark - .read() - .textFile(path); + return spark.read().textFile(path); } - private static void writeActionsAsJobInput(Dataset actionDS, - String inputActionSetId, - String path) { - actionDS - .javaRDD() + private static void writeActionsAsJobInput( + Dataset actionDS, String inputActionSetId, String path) { + actionDS.javaRDD() .mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json))) - .saveAsNewAPIHadoopFile(path, + .saveAsNewAPIHadoopFile( + path, Text.class, Text.class, SequenceFileOutputFormat.class, configuration); } - private static void assertForOafType(Path outputDir, Map> oafsByClassName, Class clazz) { - Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", 
clazz.getCanonicalName())); + private static void assertForOafType( + Path outputDir, Map> oafsByClassName, Class clazz) { + Path outputDatasetDir = + outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName())); Files.exists(outputDatasetDir); - List actuals = readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); + List actuals = + readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); actuals.sort(Comparator.comparingInt(Object::hashCode)); - List expecteds = oafsByClassName.get(clazz.getCanonicalName()).stream() - .map(json -> mapToOaf(json, clazz)) - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); + List expecteds = + oafsByClassName.get(clazz.getCanonicalName()).stream() + .map(json -> mapToOaf(json, clazz)) + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); assertIterableEquals(expecteds, actuals); } - private static Dataset readActionPayloadFromJobOutput(String path, - Class clazz) { - return spark - .read() + private static Dataset readActionPayloadFromJobOutput( + String path, Class clazz) { + return spark.read() .parquet(path) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value.getAs("payload"), clazz), + .map( + (MapFunction) + value -> + OBJECT_MAPPER.readValue( + value.getAs("payload"), clazz), Encoders.bean(clazz)); } private static T mapToOaf(String json, Class clazz) { return rethrowAsRuntimeException( () -> OBJECT_MAPPER.readValue(json, clazz), - String.format("failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName()) - ); + String.format( + "failed to map json to class: json=%s, class=%s", + json, clazz.getCanonicalName())); } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java 
index 154e0a331..6a46170c0 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java @@ -1,17 +1,16 @@ package eu.dnetlib.dhp.actionmanager.promote; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.*; -import org.junit.jupiter.api.Nested; -import org.junit.jupiter.api.Test; - -import java.util.function.BiFunction; - import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.Strategy; import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.functionFor; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.*; +import java.util.function.BiFunction; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + public class MergeAndGetTest { @Nested @@ -24,11 +23,11 @@ public class MergeAndGetTest { Oaf b = mock(Oaf.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -38,11 +37,11 @@ public class MergeAndGetTest { Relation b = mock(Relation.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -52,11 +51,11 @@ public class MergeAndGetTest { OafEntity b = mock(OafEntity.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + 
SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -66,11 +65,11 @@ public class MergeAndGetTest { Oaf b = mock(Oaf.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -80,11 +79,11 @@ public class MergeAndGetTest { OafEntity b = mock(OafEntity.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -94,7 +93,8 @@ public class MergeAndGetTest { Relation b = mock(Relation.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then Oaf x = fn.get().apply(a, b); @@ -110,11 +110,11 @@ public class MergeAndGetTest { Oaf b = mock(Oaf.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -124,30 +124,28 @@ public class MergeAndGetTest { Relation b = mock(Relation.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + 
assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { // given - class OafEntitySub1 extends OafEntity { - } - class OafEntitySub2 extends OafEntity { - } + class OafEntitySub1 extends OafEntity {} + class OafEntitySub2 extends OafEntity {} OafEntitySub1 a = mock(OafEntitySub1.class); OafEntitySub2 b = mock(OafEntitySub2.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -157,7 +155,8 @@ public class MergeAndGetTest { OafEntity b = mock(OafEntity.class); // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.MERGE_FROM_AND_GET); // then Oaf x = fn.get().apply(a, b); @@ -177,11 +176,11 @@ public class MergeAndGetTest { Relation b = mock(Relation.class); // when - SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.SELECT_NEWER_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -191,11 +190,11 @@ public class MergeAndGetTest { OafEntity b = mock(OafEntity.class); // when - SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.SELECT_NEWER_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -205,28 +204,29 @@ public class MergeAndGetTest { Result b = mock(Result.class); // when - SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); + SerializableSupplier> fn = + 
functionFor(Strategy.SELECT_NEWER_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { // given - // real types must be used because subclass-superclass resolution does not work for mocks + // real types must be used because subclass-superclass resolution does not work for + // mocks Dataset a = new Dataset(); a.setLastupdatetimestamp(1L); Result b = new Result(); b.setLastupdatetimestamp(2L); // when - SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); + SerializableSupplier> fn = + functionFor(Strategy.SELECT_NEWER_AND_GET); // then - assertThrows(RuntimeException.class, () -> - fn.get().apply(a, b)); + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); } @Test @@ -265,4 +265,4 @@ public class MergeAndGetTest { assertEquals(a, x); } } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java index 6f53fbec2..29592bb49 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java @@ -1,8 +1,20 @@ package eu.dnetlib.dhp.actionmanager.promote; +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.params.provider.Arguments.arguments; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import 
java.nio.file.Paths; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -14,21 +26,9 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Comparator; -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static org.junit.jupiter.api.Assertions.*; -import static org.junit.jupiter.params.provider.Arguments.arguments; - public class PromoteActionPayloadForGraphTableJobTest { - private static final ClassLoader cl = PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); + private static final ClassLoader cl = + PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); private static SparkSession spark; @@ -52,7 +52,9 @@ public class PromoteActionPayloadForGraphTableJobTest { @BeforeEach public void beforeEach() throws IOException { - workingDir = Files.createTempDirectory(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + workingDir = + Files.createTempDirectory( + PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); inputDir = workingDir.resolve("input"); inputGraphRootDir = inputDir.resolve("graph"); inputActionPayloadRootDir = inputDir.resolve("action_payload"); @@ -80,87 +82,130 @@ public class PromoteActionPayloadForGraphTableJobTest { Class actionPayloadClazz = OafEntity.class; // when - RuntimeException exception = assertThrows(RuntimeException.class, () -> - PromoteActionPayloadForGraphTableJob.main(new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - 
"-inputGraphTablePath", "", - "-graphTableClassName", rowClazz.getCanonicalName(), - "-inputActionPayloadPath", "", - "-actionPayloadClassName", actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", "", - "-mergeAndGetStrategy", MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() - })); + RuntimeException exception = + assertThrows( + RuntimeException.class, + () -> + PromoteActionPayloadForGraphTableJob.main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-inputGraphTablePath", "", + "-graphTableClassName", rowClazz.getCanonicalName(), + "-inputActionPayloadPath", "", + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", "", + "-mergeAndGetStrategy", + MergeAndGet.Strategy.SELECT_NEWER_AND_GET + .name() + })); // then - String msg = String.format("graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + String msg = + String.format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); assertTrue(exception.getMessage().contains(msg)); } @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") - @MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") - public void shouldPromoteActionPayloadForGraphTable(MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) throws Exception { + @MethodSource( + "eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") + public void shouldPromoteActionPayloadForGraphTable( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) + throws Exception { // given Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); - Path inputActionPayloadDir = 
createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); - Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); + Path inputActionPayloadDir = + createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); + Path outputGraphTableDir = + outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); // when - PromoteActionPayloadForGraphTableJob.main(new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-inputGraphTablePath", inputGraphTableDir.toString(), - "-graphTableClassName", rowClazz.getCanonicalName(), - "-inputActionPayloadPath", inputActionPayloadDir.toString(), - "-actionPayloadClassName", actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", outputGraphTableDir.toString(), - "-mergeAndGetStrategy", strategy.name() - }); + PromoteActionPayloadForGraphTableJob.main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-inputGraphTablePath", inputGraphTableDir.toString(), + "-graphTableClassName", rowClazz.getCanonicalName(), + "-inputActionPayloadPath", inputActionPayloadDir.toString(), + "-actionPayloadClassName", actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", outputGraphTableDir.toString(), + "-mergeAndGetStrategy", strategy.name() + }); // then assertTrue(Files.exists(outputGraphTableDir)); - List actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz) - .collectAsList() - .stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); - String expectedOutputGraphTableJsonDumpPath = resultFileLocation(strategy, rowClazz, actionPayloadClazz); - Path expectedOutputGraphTableJsonDumpFile = Paths - .get(Objects.requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)).getFile()); - List expectedOutputRows = readGraphTableFromJsonDump(expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) - .collectAsList() - 
.stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); + List actualOutputRows = + readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz) + .collectAsList().stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + String expectedOutputGraphTableJsonDumpPath = + resultFileLocation(strategy, rowClazz, actionPayloadClazz); + Path expectedOutputGraphTableJsonDumpFile = + Paths.get( + Objects.requireNonNull( + cl.getResource(expectedOutputGraphTableJsonDumpPath)) + .getFile()); + List expectedOutputRows = + readGraphTableFromJsonDump( + expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) + .collectAsList().stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); assertIterableEquals(expectedOutputRows, actualOutputRows); } } public static Stream promoteJobTestParams() { return Stream.of( - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, eu.dnetlib.dhp.schema.oaf.Dataset.class, eu.dnetlib.dhp.schema.oaf.Dataset.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, eu.dnetlib.dhp.schema.oaf.Dataset.class, eu.dnetlib.dhp.schema.oaf.Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Datasource.class, Datasource.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Organization.class, Organization.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, OtherResearchProduct.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, Result.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Result.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + Datasource.class, + Datasource.class), + arguments( + 
MergeAndGet.Strategy.MERGE_FROM_AND_GET, + Organization.class, + Organization.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + OtherResearchProduct.class, + OtherResearchProduct.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + OtherResearchProduct.class, + Result.class), arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Project.class, Project.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Publication.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + Publication.class, + Publication.class), arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Result.class), arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Relation.class, Relation.class), arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Software.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class) - ); + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class)); } - private static Path createGraphTable(Path inputGraphRootDir, - Class rowClazz) { + private static Path createGraphTable( + Path inputGraphRootDir, Class rowClazz) { String inputGraphTableJsonDumpPath = inputGraphTableJsonDumpLocation(rowClazz); - Path inputGraphTableJsonDumpFile = Paths - .get(Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)).getFile()); - Dataset rowDS = readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); + Path inputGraphTableJsonDumpFile = + Paths.get( + Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)) + .getFile()); + Dataset rowDS = + readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); String inputGraphTableName = rowClazz.getSimpleName().toLowerCase(); Path inputGraphTableDir = inputGraphRootDir.resolve(inputGraphTableName); writeGraphTableAaJobInput(rowDS, inputGraphTableDir.toString()); @@ -169,71 +214,74 @@ public class PromoteActionPayloadForGraphTableJobTest { 
private static String inputGraphTableJsonDumpLocation(Class rowClazz) { return String.format( - "%s/%s.json", "eu/dnetlib/dhp/actionmanager/promote/input/graph", rowClazz.getSimpleName().toLowerCase()); + "%s/%s.json", + "eu/dnetlib/dhp/actionmanager/promote/input/graph", + rowClazz.getSimpleName().toLowerCase()); } - private static Dataset readGraphTableFromJsonDump(String path, - Class rowClazz) { - return spark - .read() + private static Dataset readGraphTableFromJsonDump( + String path, Class rowClazz) { + return spark.read() .textFile(path) - .map((MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), Encoders.bean(rowClazz)); + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + Encoders.bean(rowClazz)); } - private static void writeGraphTableAaJobInput(Dataset rowDS, - String path) { - rowDS - .write() - .option("compression", "gzip") - .json(path); + private static void writeGraphTableAaJobInput(Dataset rowDS, String path) { + rowDS.write().option("compression", "gzip").json(path); } - private static Path createActionPayload(Path inputActionPayloadRootDir, - Class rowClazz, - Class actionPayloadClazz) { - String inputActionPayloadJsonDumpPath = inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); - Path inputActionPayloadJsonDumpFile = Paths - .get(Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)).getFile()); - Dataset actionPayloadDS = readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); - Path inputActionPayloadDir = inputActionPayloadRootDir.resolve(actionPayloadClazz.getSimpleName().toLowerCase()); + private static Path createActionPayload( + Path inputActionPayloadRootDir, Class rowClazz, Class actionPayloadClazz) { + String inputActionPayloadJsonDumpPath = + inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); + Path inputActionPayloadJsonDumpFile = + Paths.get( + Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)) + .getFile()); + 
Dataset actionPayloadDS = + readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); + Path inputActionPayloadDir = + inputActionPayloadRootDir.resolve(actionPayloadClazz.getSimpleName().toLowerCase()); writeActionPayloadAsJobInput(actionPayloadDS, inputActionPayloadDir.toString()); return inputActionPayloadDir; } - private static String inputActionPayloadJsonDumpLocation(Class rowClazz, - Class actionPayloadClazz) { + private static String inputActionPayloadJsonDumpLocation( + Class rowClazz, Class actionPayloadClazz) { - return String.format("eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", - rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); + return String.format( + "eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", + rowClazz.getSimpleName().toLowerCase(), + actionPayloadClazz.getSimpleName().toLowerCase()); } private static Dataset readActionPayloadFromJsonDump(String path) { - return spark - .read() - .textFile(path); + return spark.read().textFile(path); } - private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, - String path) { - actionPayloadDS - .withColumnRenamed("value", "payload") - .write() - .parquet(path); + private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, String path) { + actionPayloadDS.withColumnRenamed("value", "payload").write().parquet(path); } - private static Dataset readGraphTableFromJobOutput(String path, - Class rowClazz) { - return spark - .read() + private static Dataset readGraphTableFromJobOutput( + String path, Class rowClazz) { + return spark.read() .textFile(path) - .map((MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), Encoders.bean(rowClazz)); + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + Encoders.bean(rowClazz)); } - private static String resultFileLocation(MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) 
{ - return String - .format("eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", - strategy.name().toLowerCase(), rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); + private static String resultFileLocation( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + return String.format( + "eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", + strategy.name().toLowerCase(), + rowClazz.getSimpleName().toLowerCase(), + actionPayloadClazz.getSimpleName().toLowerCase()); } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java index e3fc7db48..5c69e32e3 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java @@ -1,7 +1,15 @@ package eu.dnetlib.dhp.actionmanager.promote; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import eu.dnetlib.dhp.schema.oaf.Oaf; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.function.BiFunction; +import java.util.function.Function; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -11,15 +19,6 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; -import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import 
java.util.function.BiFunction; -import java.util.function.Function; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; - public class PromoteActionPayloadFunctionsTest { private static SparkSession spark; @@ -44,20 +43,20 @@ public class PromoteActionPayloadFunctionsTest { @Test public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { // given - class OafImpl extends Oaf { - } + class OafImpl extends Oaf {} // when - assertThrows(RuntimeException.class, () -> - PromoteActionPayloadFunctions - .joinGraphTableWithActionPayloadAndMerge(null, + assertThrows( + RuntimeException.class, + () -> + PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( + null, null, null, null, null, OafImplSubSub.class, - OafImpl.class - )); + OafImpl.class)); } @Test @@ -68,40 +67,53 @@ public class PromoteActionPayloadFunctionsTest { String id2 = "id2"; String id3 = "id3"; String id4 = "id4"; - List rowData = Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3) - ); - Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + List rowData = + Arrays.asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = + spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), createOafImplSubSub(id2), - createOafImplSubSub(id3), createOafImplSubSub(id3), createOafImplSubSub(id3), - createOafImplSubSub(id4), createOafImplSubSub(id4), createOafImplSubSub(id4), createOafImplSubSub(id4) - ); - Dataset actionPayloadDS = spark.createDataset(actionPayloadData, Encoders.bean(OafImplSubSub.class)); + List actionPayloadData = + Arrays.asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), 
+ createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4)); + Dataset actionPayloadDS = + spark.createDataset(actionPayloadData, Encoders.bean(OafImplSubSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = () -> (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = + () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = + () -> OafImplRoot::getId; + SerializableSupplier> + mergeAndGetFn = + () -> + (x, y) -> { + x.merge(y); + return x; + }; // when - List results = PromoteActionPayloadFunctions - .joinGraphTableWithActionPayloadAndMerge(rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSubSub.class - ) - .collectAsList(); + List results = + PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSubSub.class) + .collectAsList(); // then assertEquals(11, results.size()); @@ -111,23 +123,24 @@ public class PromoteActionPayloadFunctionsTest { assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); assertEquals(4, results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach(result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": - case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - case "id4": - assertEquals(1, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); + results.forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + 
assertEquals(2, result.getMerged()); + break; + case "id4": + assertEquals(1, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); } @Test @@ -138,40 +151,53 @@ public class PromoteActionPayloadFunctionsTest { String id2 = "id2"; String id3 = "id3"; String id4 = "id4"; - List rowData = Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3) - ); - Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + List rowData = + Arrays.asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = + spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = Arrays.asList( - createOafImplSub(id1), - createOafImplSub(id2), createOafImplSub(id2), - createOafImplSub(id3), createOafImplSub(id3), createOafImplSub(id3), - createOafImplSub(id4), createOafImplSub(id4), createOafImplSub(id4), createOafImplSub(id4) - ); - Dataset actionPayloadDS = spark.createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); + List actionPayloadData = + Arrays.asList( + createOafImplSub(id1), + createOafImplSub(id2), + createOafImplSub(id2), + createOafImplSub(id3), + createOafImplSub(id3), + createOafImplSub(id3), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4)); + Dataset actionPayloadDS = + spark.createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = () -> (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = + () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = + () -> OafImplRoot::getId; + SerializableSupplier> + mergeAndGetFn = + () -> + (x, y) -> { + x.merge(y); + return x; + }; // 
when - List results = PromoteActionPayloadFunctions - .joinGraphTableWithActionPayloadAndMerge(rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSub.class - ) - .collectAsList(); + List results = + PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSub.class) + .collectAsList(); // then assertEquals(7, results.size()); @@ -181,22 +207,22 @@ public class PromoteActionPayloadFunctionsTest { assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); assertEquals(0, results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach(result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": - case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); + results.forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + assertEquals(2, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); } - } @Nested @@ -208,30 +234,40 @@ public class PromoteActionPayloadFunctionsTest { String id1 = "id1"; String id2 = "id2"; String id3 = "id3"; - List rowData = Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), createOafImplSubSub(id2), - createOafImplSubSub(id3), createOafImplSubSub(id3), createOafImplSubSub(id3) - ); - Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + List rowData = + Arrays.asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3)); + Dataset rowDS = + spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - 
SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = () -> (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = + () -> OafImplRoot::getId; + SerializableSupplier> + mergeAndGetFn = + () -> + (x, y) -> { + x.merge(y); + return x; + }; SerializableSupplier zeroFn = OafImplSubSub::new; - SerializableSupplier> isNotZeroFn = () -> x -> Objects.nonNull(x.getId()); + SerializableSupplier> isNotZeroFn = + () -> x -> Objects.nonNull(x.getId()); // when - List results = PromoteActionPayloadFunctions - .groupGraphTableByIdAndMerge(rowDS, - rowIdFn, - mergeAndGetFn, - zeroFn, - isNotZeroFn, - OafImplSubSub.class) - .collectAsList(); + List results = + PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( + rowDS, + rowIdFn, + mergeAndGetFn, + zeroFn, + isNotZeroFn, + OafImplSubSub.class) + .collectAsList(); // then assertEquals(3, results.size()); @@ -239,23 +275,23 @@ public class PromoteActionPayloadFunctionsTest { assertEquals(1, results.stream().filter(x -> x.getId().equals(id2)).count()); assertEquals(1, results.stream().filter(x -> x.getId().equals(id3)).count()); - results.forEach(result -> { - switch (result.getId()) { - case "id1": - assertEquals(1, result.getMerged()); - break; - case "id2": - assertEquals(2, result.getMerged()); - break; - case "id3": - assertEquals(3, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); + results.forEach( + result -> { + switch (result.getId()) { + case "id1": + assertEquals(1, result.getMerged()); + break; + case "id2": + assertEquals(2, result.getMerged()); + break; + case "id3": + assertEquals(3, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); } - } public static class OafImplRoot extends Oaf { @@ -310,5 +346,4 @@ public class PromoteActionPayloadFunctionsTest { x.setId(id); return x; } - } diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index f4da193a1..494d3d5c1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -7,6 +7,11 @@ import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageType; +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -24,94 +29,135 @@ import org.dom4j.Document; import org.dom4j.Node; import org.dom4j.io.SAXReader; -import java.io.ByteArrayInputStream; -import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; - public class GenerateNativeStoreSparkJob { + public static MetadataRecord parseRecord( + final String input, + final String xpath, + final String encoding, + final Provenance provenance, + final Long dateOfCollection, + final LongAccumulator totalItems, + final LongAccumulator invalidRecords) { - public static MetadataRecord parseRecord (final String input, final String xpath, final String encoding, final Provenance provenance, final Long dateOfCollection, final LongAccumulator totalItems, final LongAccumulator invalidRecords) { - - if(totalItems != null) - totalItems.add(1); + if (totalItems != null) totalItems.add(1); try { SAXReader reader = new SAXReader(); - Document document = reader.read(new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); + Document document = + reader.read(new 
ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); Node node = document.selectSingleNode(xpath); final String originalIdentifier = node.getText(); if (StringUtils.isBlank(originalIdentifier)) { - if (invalidRecords!= null) - invalidRecords.add(1); + if (invalidRecords != null) invalidRecords.add(1); return null; } - return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); + return new MetadataRecord( + originalIdentifier, encoding, provenance, input, dateOfCollection); } catch (Throwable e) { - if (invalidRecords!= null) - invalidRecords.add(1); + if (invalidRecords != null) invalidRecords.add(1); e.printStackTrace(); return null; - } } public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(GenerateNativeStoreSparkJob.class.getResourceAsStream("/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + GenerateNativeStoreSparkJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Provenance provenance = + jsonMapper.readValue(parser.get("provenance"), Provenance.class); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final SparkSession spark = SparkSession - .builder() - .appName("GenerateNativeStoreSparkJob") - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName("GenerateNativeStoreSparkJob") + .master(parser.get("master")) + .getOrCreate(); final Map ongoingMap = new HashMap<>(); 
final Map reportMap = new HashMap<>(); - final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final boolean test = + parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final JavaPairRDD inputRDD = sc.sequenceFile(parser.get("input"), IntWritable.class, Text.class); + final JavaPairRDD inputRDD = + sc.sequenceFile(parser.get("input"), IntWritable.class, Text.class); final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - final MessageManager manager = new MessageManager(parser.get("rabbitHost"), parser.get("rabbitUser"), parser.get("rabbitPassword"), false, false, null); + final MessageManager manager = + new MessageManager( + parser.get("rabbitHost"), + parser.get("rabbitUser"), + parser.get("rabbitPassword"), + false, + false, + null); - final JavaRDD mappeRDD = inputRDD.map(item -> parseRecord(item._2().toString(), parser.get("xpath"), parser.get("encoding"), provenance, dateOfCollection, totalItems, invalidRecords)) - .filter(Objects::nonNull).distinct(); + final JavaRDD mappeRDD = + inputRDD.map( + item -> + parseRecord( + item._2().toString(), + parser.get("xpath"), + parser.get("encoding"), + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); ongoingMap.put("ongoing", "0"); if (!test) { - manager.sendMessage(new Message(parser.get("workflowId"),"DataFrameCreation", MessageType.ONGOING, ongoingMap ), parser.get("rabbitOngoingQueue"), true, false); + manager.sendMessage( + new Message( + parser.get("workflowId"), + "DataFrameCreation", + MessageType.ONGOING, + ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); } - final Encoder encoder = Encoders.bean(MetadataRecord.class); final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); 
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", ""+ totalItems.value()); + ongoingMap.put("ongoing", "" + totalItems.value()); if (!test) { - manager.sendMessage(new Message(parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), parser.get("rabbitOngoingQueue"), true, false); - + manager.sendMessage( + new Message( + parser.get("workflowId"), + "DataFrameCreation", + MessageType.ONGOING, + ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); } mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem" , ""+ totalItems.value()); + reportMap.put("inputItem", "" + totalItems.value()); reportMap.put("invalidRecords", "" + invalidRecords.value()); reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); if (!test) { - manager.sendMessage(new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), parser.get("rabbitReportQueue"), true, false); + manager.sendMessage( + new Message( + parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + parser.get("rabbitReportQueue"), + true, + false); manager.close(); } - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index cfa0e417b..fa628ac9e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -2,10 +2,9 @@ package eu.dnetlib.dhp.collection.plugin; import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - import java.util.stream.Stream; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws 
DnetCollectorException; + Stream collect(ApiDescriptor api) throws DnetCollectorException; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index ad893ce52..a089a5750 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -1,5 +1,11 @@ package eu.dnetlib.dhp.collection.plugin.oai; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -7,62 +13,70 @@ import java.util.Spliterator; import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; - - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - public class OaiCollectorPlugin implements CollectorPlugin { - private static final String FORMAT_PARAM = "format"; - private static final String OAI_SET_PARAM = "set"; - private static final Object OAI_FROM_DATE_PARAM = "fromDate"; - private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; + private static final String FORMAT_PARAM = "format"; + private static final String OAI_SET_PARAM = "set"; + private static final Object OAI_FROM_DATE_PARAM = "fromDate"; + private static final Object 
OAI_UNTIL_DATE_PARAM = "untilDate"; + private OaiIteratorFactory oaiIteratorFactory; - private OaiIteratorFactory oaiIteratorFactory; + @Override + public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + final String baseUrl = api.getBaseUrl(); + final String mdFormat = api.getParams().get(FORMAT_PARAM); + final String setParam = api.getParams().get(OAI_SET_PARAM); + final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); + final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); - @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { - final String baseUrl = api.getBaseUrl(); - final String mdFormat = api.getParams().get(FORMAT_PARAM); - final String setParam = api.getParams().get(OAI_SET_PARAM); - final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); - final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); + final List sets = new ArrayList<>(); + if (setParam != null) { + sets.addAll( + Lists.newArrayList( + Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); + } + if (sets.isEmpty()) { + // If no set is defined, ALL the sets must be harvested + sets.add(""); + } - final List sets = new ArrayList<>(); - if (setParam != null) { - sets.addAll(Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); - } - if (sets.isEmpty()) { - // If no set is defined, ALL the sets must be harvested - sets.add(""); - } + if (baseUrl == null || baseUrl.isEmpty()) { + throw new DnetCollectorException("Param 'baseurl' is null or empty"); + } - if (baseUrl == null || baseUrl.isEmpty()) { throw new DnetCollectorException("Param 'baseurl' is null or empty"); } + if (mdFormat == null || mdFormat.isEmpty()) { + throw new DnetCollectorException("Param 'mdFormat' is null or empty"); + } - if (mdFormat == null || mdFormat.isEmpty()) { throw new DnetCollectorException("Param 'mdFormat' is null or empty"); } + if (fromDate != null && 
!fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); + } - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); } + if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); + } - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); } + final Iterator> iters = + sets.stream() + .map( + set -> + getOaiIteratorFactory() + .newIterator( + baseUrl, mdFormat, set, fromDate, + untilDate)) + .iterator(); - final Iterator> iters = sets.stream() - .map(set -> getOaiIteratorFactory().newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) - .iterator(); + return StreamSupport.stream( + Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), + false); + } - return StreamSupport.stream(Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); - } - - public OaiIteratorFactory getOaiIteratorFactory() { - if (oaiIteratorFactory == null){ - oaiIteratorFactory = new OaiIteratorFactory(); - } - return oaiIteratorFactory; - } + public OaiIteratorFactory getOaiIteratorFactory() { + if (oaiIteratorFactory == null) { + oaiIteratorFactory = new OaiIteratorFactory(); + } + return oaiIteratorFactory; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index cd093ed2d..013e83722 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -1,15 +1,14 @@ package 
eu.dnetlib.dhp.collection.plugin.oai; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; - -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; -import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -18,146 +17,163 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; - public class OaiIterator implements Iterator { - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM + private static final Log log = + LogFactory.getLog(OaiIterator.class); // NOPMD by marko on 11/24/08 5:02 PM - private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); + private final Queue queue = new PriorityBlockingQueue<>(); + private final SAXReader reader = new SAXReader(); - private final String baseUrl; - private final String set; - private final String mdFormat; - private final String fromDate; - private final String untilDate; - private String token; - private boolean started; - private final HttpConnector httpConnector; + private final String baseUrl; + private final String set; + private final String mdFormat; + private final String fromDate; + private final String untilDate; + private String token; + private boolean started; + private final HttpConnector httpConnector; - public OaiIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate, - 
final HttpConnector httpConnector) { - this.baseUrl = baseUrl; - this.mdFormat = mdFormat; - this.set = set; - this.fromDate = fromDate; - this.untilDate = untilDate; - this.started = false; - this.httpConnector = httpConnector; - } + public OaiIterator( + final String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate, + final HttpConnector httpConnector) { + this.baseUrl = baseUrl; + this.mdFormat = mdFormat; + this.set = set; + this.fromDate = fromDate; + this.untilDate = untilDate; + this.started = false; + this.httpConnector = httpConnector; + } - private void verifyStarted() { - if (!this.started) { - this.started = true; - try { - this.token = firstPage(); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - } + private void verifyStarted() { + if (!this.started) { + this.started = true; + try { + this.token = firstPage(); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + } - @Override - public boolean hasNext() { - synchronized (queue) { - verifyStarted(); - return !queue.isEmpty(); - } - } + @Override + public boolean hasNext() { + synchronized (queue) { + verifyStarted(); + return !queue.isEmpty(); + } + } - @Override - public String next() { - synchronized (queue) { - verifyStarted(); - final String res = queue.poll(); - while (queue.isEmpty() && token != null && !token.isEmpty()) { - try { - token = otherPages(token); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - return res; - } - } + @Override + public String next() { + synchronized (queue) { + verifyStarted(); + final String res = queue.poll(); + while (queue.isEmpty() && token != null && !token.isEmpty()) { + try { + token = otherPages(token); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + return res; + } + } - @Override - public void remove() {} + @Override + public void remove() {} - private 
String firstPage() throws DnetCollectorException { - try { - String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); - if (set != null && !set.isEmpty()) { - url += "&set=" + URLEncoder.encode(set, "UTF-8"); - } - if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); - } - if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); - } - log.info("Start harvesting using url: " + url); + private String firstPage() throws DnetCollectorException { + try { + String url = + baseUrl + + "?verb=ListRecords&metadataPrefix=" + + URLEncoder.encode(mdFormat, "UTF-8"); + if (set != null && !set.isEmpty()) { + url += "&set=" + URLEncoder.encode(set, "UTF-8"); + } + if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); + } + if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); + } + log.info("Start harvesting using url: " + url); - return downloadPage(url); - } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); - } - } + return downloadPage(url); + } catch (final UnsupportedEncodingException e) { + throw new DnetCollectorException(e); + } + } - private String extractResumptionToken(final String xml) { + private String extractResumptionToken(final String xml) { - final String s = StringUtils.substringAfter(xml, "", "", " newIterator(final String baseUrl, final String mdFormat, final String set, final String fromDate, final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); - } - - private HttpConnector getHttpConnector() { - if (httpConnector== null) - httpConnector = new HttpConnector(); - return httpConnector; - } - - + public Iterator newIterator( + final 
String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); + } + private HttpConnector getHttpConnector() { + if (httpConnector == null) httpConnector = new HttpConnector(); + return httpConnector; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java index 75ccc4d91..2b9921922 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java @@ -2,29 +2,30 @@ package eu.dnetlib.dhp.collection.worker; public class DnetCollectorException extends Exception { - /** - * - */ - private static final long serialVersionUID = -290723075076039757L; + /** */ + private static final long serialVersionUID = -290723075076039757L; - public DnetCollectorException() { - super(); - } + public DnetCollectorException() { + super(); + } - public DnetCollectorException(final String message, final Throwable cause, final boolean enableSuppression, final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } + public DnetCollectorException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } - public DnetCollectorException(final String message, final Throwable cause) { - super(message, cause); - } + public DnetCollectorException(final String message, final Throwable cause) { + super(message, cause); + } - public DnetCollectorException(final String message) { - super(message); - } - - public DnetCollectorException(final Throwable cause) 
{ - super(cause); - } + public DnetCollectorException(final String message) { + super(message); + } + public DnetCollectorException(final Throwable cause) { + super(cause); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java index c76536b3a..5a0852501 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java @@ -2,13 +2,17 @@ package eu.dnetlib.dhp.collection.worker; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.collector.worker.model.ApiDescriptor; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageType; +import java.io.IOException; +import java.net.URI; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -18,37 +22,34 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - public class DnetCollectorWorker { private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - private final CollectorPluginFactory collectorPluginFactory; private final ArgumentApplicationParser argumentParser; private final MessageManager manager; - - public DnetCollectorWorker(final CollectorPluginFactory 
collectorPluginFactory, final ArgumentApplicationParser argumentParser, final MessageManager manager) throws DnetCollectorException { + public DnetCollectorWorker( + final CollectorPluginFactory collectorPluginFactory, + final ArgumentApplicationParser argumentParser, + final MessageManager manager) + throws DnetCollectorException { this.collectorPluginFactory = collectorPluginFactory; this.argumentParser = argumentParser; this.manager = manager; } - public void collect() throws DnetCollectorException { try { final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); + final ApiDescriptor api = + jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); + final CollectorPlugin plugin = + collectorPluginFactory.getPluginByProtocol(api.getProtocol()); final String hdfsuri = argumentParser.get("namenode"); @@ -62,7 +63,7 @@ public class DnetCollectorWorker { System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); System.setProperty("hadoop.home.dir", "/"); - //Get the filesystem - HDFS + // Get the filesystem - HDFS FileSystem.get(URI.create(hdfsuri), conf); Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); @@ -71,43 +72,69 @@ public class DnetCollectorWorker { final Map ongoingMap = new HashMap<>(); final Map reportMap = new HashMap<>(); final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { final IntWritable 
key = new IntWritable(counter.get()); final Text value = new Text(); - plugin.collect(api).forEach(content -> { - - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log.debug("Sending message: "+ manager.sendMessage(new Message(argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), argumentParser.get("rabbitOngoingQueue"), true, false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - - }); + plugin.collect(api) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + if (counter.get() % 10 == 0) { + try { + ongoingMap.put("ongoing", "" + counter.get()); + log.debug( + "Sending message: " + + manager.sendMessage( + new Message( + argumentParser.get( + "workflowId"), + "Collection", + MessageType.ONGOING, + ongoingMap), + argumentParser.get( + "rabbitOngoingQueue"), + true, + false)); + } catch (Exception e) { + log.error("Error on sending message ", e); + } + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } ongoingMap.put("ongoing", "" + counter.get()); - manager.sendMessage(new Message(argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), argumentParser.get("rabbitOngoingQueue"), true, false); + manager.sendMessage( + new Message( + argumentParser.get("workflowId"), + "Collection", + MessageType.ONGOING, + ongoingMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); reportMap.put("collected", "" + counter.get()); - manager.sendMessage(new Message(argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), argumentParser.get("rabbitOngoingQueue"), true, false); + manager.sendMessage( + new Message( + argumentParser.get("workflowId"), + "Collection", + 
MessageType.REPORT, + reportMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); manager.close(); } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ",e); + throw new DnetCollectorException("Error on collecting ", e); } } - - - - - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java index d4bd22817..9c19383bb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.collection.worker; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.MessageManager; @@ -8,16 +7,13 @@ import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** - * DnetCollectortWorkerApplication is the main class responsible to start - * the Dnet Collection into HDFS. - * This module will be executed on the hadoop cluster and taking in input some parameters - * that tells it which is the right collector plugin to use and where store the data into HDFS path + * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into + * HDFS. 
This module will be executed on the hadoop cluster and taking in input some parameters that + * tells it which is the right collector plugin to use and where store the data into HDFS path * * @author Sandro La Bruzzo */ - public class DnetCollectorWorkerApplication { private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); @@ -26,22 +22,27 @@ public class DnetCollectorWorkerApplication { private static ArgumentApplicationParser argumentParser; - - /** - * @param args - */ + /** @param args */ public static void main(final String[] args) throws Exception { - argumentParser= new ArgumentApplicationParser(IOUtils.toString(DnetCollectorWorker.class.getResourceAsStream("/eu/dnetlib/collector/worker/collector_parameter.json"))); + argumentParser = + new ArgumentApplicationParser( + IOUtils.toString( + DnetCollectorWorker.class.getResourceAsStream( + "/eu/dnetlib/collector/worker/collector_parameter.json"))); argumentParser.parseArgument(args); log.info("hdfsPath =" + argumentParser.get("hdfsPath")); log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = new MessageManager(argumentParser.get("rabbitHost"), argumentParser.get("rabbitUser"), argumentParser.get("rabbitPassword"), false, false, null); - final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); + final MessageManager manager = + new MessageManager( + argumentParser.get("rabbitHost"), + argumentParser.get("rabbitUser"), + argumentParser.get("rabbitPassword"), + false, + false, + null); + final DnetCollectorWorker worker = + new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); worker.collect(); - - } - - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java index 
807479c5d..27d8fa5fb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java @@ -4,16 +4,15 @@ import java.util.LinkedList; public class CollectorPluginErrorLogList extends LinkedList { - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = new String(); - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } + private static final long serialVersionUID = -6925786561303289704L; + @Override + public String toString() { + String log = new String(); + int index = 0; + for (final String errorMessage : this) { + log += String.format("Retry #%s: %s / ", index++, errorMessage); + } + return log; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index cc2eaaddc..2dd0f1a6f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -4,19 +4,16 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -; - - public class CollectorPluginFactory { - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { - if (protocol==null) throw new DnetCollectorException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()){ + public CollectorPlugin getPluginByProtocol(final String 
protocol) + throws DnetCollectorException { + if (protocol == null) throw new DnetCollectorException("protocol cannot be null"); + switch (protocol.toLowerCase().trim()) { case "oai": return new OaiCollectorPlugin(); default: - throw new DnetCollectorException("UNknown protocol"); + throw new DnetCollectorException("UNknown protocol"); } - } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index 24e9f1ac1..816ebbd80 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -1,15 +1,6 @@ package eu.dnetlib.dhp.collection.worker.utils; import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; import java.io.IOException; import java.io.InputStream; import java.net.*; @@ -17,203 +8,243 @@ import java.security.GeneralSecurityException; import java.security.cert.X509Certificate; import java.util.List; import java.util.Map; - +import javax.net.ssl.HttpsURLConnection; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.math.NumberUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; public class HttpConnector { - private static final Log log = LogFactory.getLog(HttpConnector.class); + private static final Log log = 
LogFactory.getLog(HttpConnector.class); - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds + private int maxNumberOfRetry = 6; + private int defaultDelay = 120; // seconds + private int readTimeOut = 120; // seconds - private String responseType = null; + private String responseType = null; - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } + public HttpConnector() { + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl - * the URL - * @return the content of the downloaded resource - * @throws DnetCollectorException - * when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl) throws DnetCollectorException { + return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + } - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl - * the URL - * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException - * when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new 
CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content as a stream via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource as InputStream + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public InputStream getInputSourceAsStream(final String requestUrl) + throws DnetCollectorException { + return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + } - private String attemptDownlaodAsString(final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } + private String attemptDownlaodAsString( + final String requestUrl, + final int retryNumber, + final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { + try { + final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + try { + return IOUtils.toString(s); + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + } finally { + IOUtils.closeQuietly(s); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private InputStream attemptDownload(final String requestUrl, final int retryNumber, final 
CollectorPluginErrorLogList errorList) - throws DnetCollectorException { + private InputStream attemptDownload( + final String requestUrl, + final int retryNumber, + final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { - if (retryNumber > maxNumberOfRetry) { throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); } + if (retryNumber > maxNumberOfRetry) { + throw new DnetCollectorException( + "Max number of retries exceeded. Cause: \n " + errorList); + } - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; + log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + try { + InputStream input = null; - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); + try { + final HttpURLConnection urlConn = + (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(readTimeOut * 1000); + urlConn.addRequestProperty("User-Agent", userAgent); - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested 
url has been moved to " + newUrl); - errorList.add(String.format("%s %s. Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log.error(String.format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } + final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (retryAfter > 0 + && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { + log.warn("waiting and repeating request after " + retryAfter + " sec."); + Thread.sleep(retryAfter * 1000); + errorList.add("503 Service Unavailable"); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM + || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.debug("The requested url has been moved to " + newUrl); + errorList.add( + String.format( + "%s %s. 
Moved to: %s", + urlConn.getResponseCode(), + urlConn.getResponseMessage(), + newUrl)); + urlConn.disconnect(); + return attemptDownload(newUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { + log.error( + String.format( + "HTTP error: %s %s", + urlConn.getResponseCode(), urlConn.getResponseMessage())); + Thread.sleep(defaultDelay * 1000); + errorList.add( + String.format( + "%s %s", + urlConn.getResponseCode(), urlConn.getResponseMessage())); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } + for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (final String v : e.getValue()) { + log.debug(" key: " + e.getKey() + " - value: " + v); + } + } + } + } - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("retry-after") && headerMap.get(key).size() > 0 - && 
NumberUtils.isNumber(headerMap.get(key).get(0))) { return Integer.parseInt(headerMap.get(key).get(0)) + 10; } - } - return -1; - } + private int obtainRetryAfter(final Map> headerMap) { + for (final String key : headerMap.keySet()) { + if (key != null + && key.toLowerCase().equals("retry-after") + && headerMap.get(key).size() > 0 + && NumberUtils.isNumber(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } - private String obtainNewLocation(final Map> headerMap) throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { return headerMap.get(key).get(0); } - } - throw new DnetCollectorException("The requested url has been MOVED, but 'location' param is MISSING"); - } + private String obtainNewLocation(final Map> headerMap) + throws DnetCollectorException { + for (final String key : headerMap.keySet()) { + if (key != null + && key.toLowerCase().equals("location") + && headerMap.get(key).size() > 0) { + return headerMap.get(key).get(0); + } + } + throw new DnetCollectorException( + "The requested url has been MOVED, but 'location' param is MISSING"); + } - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { + /** + * register for https scheme; this is a workaround and not intended for the use in trusted + * environments + */ + public void initTrustManager() { + final X509TrustManager tm = + new X509TrustManager() { - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) {} + @Override + public void checkClientTrusted( + final X509Certificate[] xcs, final String string) {} - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) {} + @Override + public void 
checkServerTrusted( + final X509Certificate[] xcs, final String string) {} - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { tm }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + }; + try { + final SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(null, new TrustManager[] {tm}, null); + HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); + } catch (final GeneralSecurityException e) { + log.fatal(e); + throw new IllegalStateException(e); + } + } - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } + public void setMaxNumberOfRetry(final int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } - public int getDefaultDelay() { - return defaultDelay; - } + public int getDefaultDelay() { + return defaultDelay; + } - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } + public void setDefaultDelay(final int defaultDelay) { + this.defaultDelay = defaultDelay; + } - public int getReadTimeOut() { - return readTimeOut; - } + public int getReadTimeOut() { + return readTimeOut; + } - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } + public void setReadTimeOut(final int readTimeOut) { + this.readTimeOut = readTimeOut; + } + public String getResponseType() { + return responseType; + } } diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java index 8c8ee629f..911b65cc0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java @@ -6,254 +6,392 @@ import java.util.Map; import java.util.Set; import java.util.regex.Pattern; -/** - * @author jochen, Andreas Czerniak - * - */ +/** @author jochen, Andreas Czerniak */ public class XmlCleaner { - /** - * Pattern for numeric entities. - */ - private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$ - // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$ + /** Pattern for numeric entities. */ + private static Pattern validCharacterEntityPattern = + Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ + // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); + // //$NON-NLS-1$ - // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to - private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); + // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to + private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); - /** - * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | - * [#x10000-#x10FFFF] - */ - private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$ + /** + * Pattern that negates the allowable XML 4 byte unicode characters. 
Valid are: #x9 | #xA | #xD + * | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + */ + private static Pattern invalidCharacterPattern = + Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ - // Map entities to their unicode equivalent - private static Set goodEntities = new HashSet<>(); - private static Map badEntities = new HashMap<>(); + // Map entities to their unicode equivalent + private static Set goodEntities = new HashSet<>(); + private static Map badEntities = new HashMap<>(); - static { - // pre-defined XML entities - goodEntities.add("""); //$NON-NLS-1$ // quotation mark - goodEntities.add("&"); //$NON-NLS-1$ // ampersand - goodEntities.add("<"); //$NON-NLS-1$ // less-than sign - goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign - // control entities - // badEntities.put(" ", ""); - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‡", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - 
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ÿ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - // misc entities - badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro - badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark - badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark - // Latin 1 entities - badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space - badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ 
//$NON-NLS-2$ // inverted exclamation mark - badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign - badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign - badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign - badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign - badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar - badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign - badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis - badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign - badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator - badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark - badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign - badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen - badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign - badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron - badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign - badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign - badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two - badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three - badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent - badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign - badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign - badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot - badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla - badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one - badEntities.put("º", "\u00BA"); //$NON-NLS-1$ 
//$NON-NLS-2$ // masculine ordinal indicator - badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark - badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter - badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half - badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters - badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark - badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave - badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute - badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex - badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde - badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis - badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above - badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE - badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla - badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave - badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute - badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex - badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis - badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave - badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute - badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital 
letter I with circumflex - badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis - badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH - badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde - badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave - badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute - badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex - badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde - badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis - badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign - badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke - badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave - badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute - badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex - badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis - badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute - badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN - badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s - badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave - badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute - badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex - 
badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde - badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis - badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above - badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae - badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla - badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave - badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute - badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex - badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis - badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave - badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute - badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex - badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis - badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth - badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde - badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave - badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute - badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex - badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde - badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis - badEntities.put("÷", "\u00F7"); 
//$NON-NLS-1$ //$NON-NLS-2$ // division sign - badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke - badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave - badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute - badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex - badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis - badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute - badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn - badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis - } + static { + // pre-defined XML entities + goodEntities.add("""); // $NON-NLS-1$ // quotation mark + goodEntities.add("&"); // $NON-NLS-1$ // ampersand + goodEntities.add("<"); // $NON-NLS-1$ // less-than sign + goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign + // control entities + // badEntities.put(" ", ""); + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ƒ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + 
badEntities.put("‰", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ÿ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + // misc entities + badEntities.put("€", "\u20AC"); // $NON-NLS-1$ 
//$NON-NLS-2$ // euro + badEntities.put( + "‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark + badEntities.put( + "’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark + // Latin 1 entities + badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space + badEntities.put( + "¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark + badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign + badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign + badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign + badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign + badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar + badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign + badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis + badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign + badEntities.put( + "ª", "\u00AA"); // $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator + badEntities.put( + "«", + "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark + badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign + badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen + badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign + badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron + badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign + badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign + badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two + badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three + badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent + badEntities.put("µ", "\u00B5"); 
// $NON-NLS-1$ //$NON-NLS-2$ // micro sign + badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign + badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot + badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla + badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one + badEntities.put( + "º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator + badEntities.put( + "»", + "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark + badEntities.put( + "¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter + badEntities.put( + "½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half + badEntities.put( + "¾", + "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters + badEntities.put( + "¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question mark + badEntities.put( + "À", + "\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave + badEntities.put( + "Á", + "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute + badEntities.put( + "Â", + "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex + badEntities.put( + "Ã", + "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde + badEntities.put( + "Ä", + "\u00C4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis + badEntities.put( + "Å", + "\u00C5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above + badEntities.put( + "Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE + badEntities.put( + "Ç", + "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla + badEntities.put( + "È", + "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave + badEntities.put( + "É", + "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with 
acute + badEntities.put( + "Ê", + "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex + badEntities.put( + "Ë", + "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis + badEntities.put( + "Ì", + "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave + badEntities.put( + "Í", + "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute + badEntities.put( + "Î", + "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex + badEntities.put( + "Ï", + "\u00CF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis + badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH + badEntities.put( + "Ñ", + "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde + badEntities.put( + "Ò", + "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave + badEntities.put( + "Ó", + "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute + badEntities.put( + "Ô", + "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex + badEntities.put( + "Õ", + "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde + badEntities.put( + "Ö", + "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis + badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign + badEntities.put( + "Ø", + "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke + badEntities.put( + "Ù", + "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave + badEntities.put( + "Ú", + "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute + badEntities.put( + "Û", + "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex + badEntities.put( + "Ü", + "\u00DC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with 
diaeresis + badEntities.put( + "Ý", + "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute + badEntities.put( + "Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN + badEntities.put( + "ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s + badEntities.put( + "à", + "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave + badEntities.put( + "á", + "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute + badEntities.put( + "â", + "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex + badEntities.put( + "ã", + "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde + badEntities.put( + "ä", + "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis + badEntities.put( + "å", + "\u00E5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above + badEntities.put("æ", "\u00E6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae + badEntities.put( + "ç", + "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla + badEntities.put( + "è", + "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave + badEntities.put( + "é", + "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute + badEntities.put( + "ê", + "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex + badEntities.put( + "ë", + "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis + badEntities.put( + "ì", + "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave + badEntities.put( + "í", + "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute + badEntities.put( + "î", + "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex + badEntities.put( + "ï", + "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis + badEntities.put("ð", 
"\u00F0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth + badEntities.put( + "ñ", + "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde + badEntities.put( + "ò", + "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave + badEntities.put( + "ó", + "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute + badEntities.put( + "ô", + "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex + badEntities.put( + "õ", + "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde + badEntities.put( + "ö", + "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis + badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ // division sign + badEntities.put( + "ø", + "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke + badEntities.put( + "ù", + "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave + badEntities.put( + "ú", + "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute + badEntities.put( + "û", + "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex + badEntities.put( + "ü", + "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis + badEntities.put( + "ý", + "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute + badEntities.put( + "þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn + badEntities.put( + "ÿ", + "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis + } - /** - * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each - * instance of a bare {@literal &}, replace it with {@literal &
- * } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}. - * - * @param broken - * the string to handle entities - * @return the string with entities appropriately fixed up - */ - static public String cleanAllEntities(final String broken) { - if (broken == null) { return null; } + /** + * For each entity in the input that is not allowed in XML, replace the entity with its unicode + * equivalent or remove it. For each instance of a bare {@literal &}, replace it with {@literal + * &
} XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, + * {@literal &lt;} and {@literal &gt;}. + * + * @param broken the string to handle entities + * @return the string with entities appropriately fixed up + */ + public static String cleanAllEntities(final String broken) { + if (broken == null) { + return null; + } - String working = invalidControlCharPattern.matcher(broken).replaceAll(""); - working = invalidCharacterPattern.matcher(working).replaceAll(""); + String working = invalidControlCharPattern.matcher(broken).replaceAll(""); + working = invalidCharacterPattern.matcher(working).replaceAll(""); - int cleanfrom = 0; + int cleanfrom = 0; - while (true) { - int amp = working.indexOf('&', cleanfrom); - // If there are no more amps then we are done - if (amp == -1) { - break; - } - // Skip references of the kind &#ddd; - if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { - cleanfrom = working.indexOf(';', amp) + 1; - continue; - } - int i = amp + 1; - while (true) { - // if we are at the end of the string then just escape the '&'; - if (i >= working.length()) { return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ - } - // if we have come to a ; then we have an entity - // If it is something that xml can't handle then replace it. 
- final char c = working.charAt(i); - if (c == ';') { - final String entity = working.substring(amp, i + 1); - final String replace = handleEntity(entity); - working = working.substring(0, amp) + replace + working.substring(i + 1); - break; - } - // Did we end an entity without finding a closing ; - // Then treat it as an '&' that needs to be replaced with & - if (!Character.isLetterOrDigit(c)) { - working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$ - amp = i + 4; // account for the 4 extra characters - break; - } - i++; - } - cleanfrom = amp + 1; - } + while (true) { + int amp = working.indexOf('&', cleanfrom); + // If there are no more amps then we are done + if (amp == -1) { + break; + } + // Skip references of the kind &#ddd; + if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { + cleanfrom = working.indexOf(';', amp) + 1; + continue; + } + int i = amp + 1; + while (true) { + // if we are at the end of the string then just escape the '&'; + if (i >= working.length()) { + return working.substring(0, amp) + + "&" + + working.substring(amp + 1); // $NON-NLS-1$ + } + // if we have come to a ; then we have an entity + // If it is something that xml can't handle then replace it. 
+ final char c = working.charAt(i); + if (c == ';') { + final String entity = working.substring(amp, i + 1); + final String replace = handleEntity(entity); + working = working.substring(0, amp) + replace + working.substring(i + 1); + break; + } + // Did we end an entity without finding a closing ; + // Then treat it as an '&' that needs to be replaced with & + if (!Character.isLetterOrDigit(c)) { + working = + working.substring(0, amp) + + "&" + + working.substring(amp + 1); // $NON-NLS-1$ + amp = i + 4; // account for the 4 extra characters + break; + } + i++; + } + cleanfrom = amp + 1; + } - if (Pattern.compile("<<").matcher(working).find()) { - working = working.replaceAll("<<", "<<"); - } + if (Pattern.compile("<<").matcher(working).find()) { + working = working.replaceAll("<<", "<<"); + } - if (Pattern.compile(">>").matcher(working).find()) { - working = working.replaceAll(">>", ">>"); - } + if (Pattern.compile(">>").matcher(working).find()) { + working = working.replaceAll(">>", ">>"); + } - return working; - } + return working; + } - /** - * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities: - * &amp;, &quot;, &lt; and &gt;. - * - * @param entity - * the entity to be replaced - * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. - */ - private static String handleEntity(final String entity) { - if (goodEntities.contains(entity)) { return entity; } + /** + * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip + * it out. XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;. + * + * @param entity the entity to be replaced + * @return the substitution for the entity, either itself, the unicode equivalent or an empty + * string. 
+ */ + private static String handleEntity(final String entity) { + if (goodEntities.contains(entity)) { + return entity; + } - final String replace = badEntities.get(entity); - if (replace != null) { return replace; } + final String replace = badEntities.get(entity); + if (replace != null) { + return replace; + } - return replace != null ? replace : ""; - } + return replace != null ? replace : ""; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java index 9d0e82aca..126a97e39 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/LicenseComparator.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.migration.actions; import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; - import java.util.Comparator; public class LicenseComparator implements Comparator { @@ -45,5 +44,4 @@ public class LicenseComparator implements Comparator { // Else (but unlikely), lexicographical ordering will do. 
return lClass.compareTo(rClass); } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java index 487fac359..680f7759c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/MigrateActionSet.java @@ -6,6 +6,11 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import java.io.File; +import java.io.FileOutputStream; +import java.io.OutputStream; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; @@ -17,12 +22,6 @@ import org.apache.hadoop.tools.DistCp; import org.apache.hadoop.tools.DistCpOptions; import org.apache.hadoop.util.ToolRunner; -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; -import java.util.*; -import java.util.stream.Collectors; - public class MigrateActionSet { private static final Log log = LogFactory.getLog(MigrateActionSet.class); @@ -34,9 +33,11 @@ public class MigrateActionSet { private static Boolean DEFAULT_TRANSFORM_ONLY = false; public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateActionSet.class.getResourceAsStream( + "/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json"))); 
parser.parseArgument(args); new MigrateActionSet().run(parser); @@ -47,7 +48,7 @@ public class MigrateActionSet { final String isLookupUrl = parser.get("isLookupUrl"); final String sourceNN = parser.get("sourceNameNode"); final String targetNN = parser.get("targetNameNode"); - final String workDir = parser.get("workingDirectory"); + final String workDir = parser.get("workingDirectory"); final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); final String distcp_memory_mb = parser.get("distcp_memory_mb"); @@ -63,10 +64,12 @@ public class MigrateActionSet { ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + Configuration conf = + getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); FileSystem targetFS = FileSystem.get(conf); - Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + Configuration sourceConf = + getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); FileSystem sourceFS = FileSystem.get(sourceConf); @@ -75,14 +78,20 @@ public class MigrateActionSet { List targetPaths = new ArrayList<>(); final List sourcePaths = getSourcePaths(sourceNN, isLookUp); - log.info(String.format("paths to process:\n%s", sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n")))); - for(Path source : sourcePaths) { + log.info( + String.format( + "paths to process:\n%s", + sourcePaths.stream() + .map(p -> p.toString()) + .collect(Collectors.joining("\n")))); + for (Path source : sourcePaths) { if (!sourceFS.exists(source)) { log.warn(String.format("skipping unexisting path: %s", source)); } else { - LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); + LinkedList pathQ = + 
Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); final String rawSet = pathQ.pollLast(); log.info(String.format("got RAWSET: %s", rawSet)); @@ -91,7 +100,14 @@ public class MigrateActionSet { final String actionSetDirectory = pathQ.pollLast(); - final Path targetPath = new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); + final Path targetPath = + new Path( + targetNN + + workDir + + SEPARATOR + + actionSetDirectory + + SEPARATOR + + rawSet); log.info(String.format("using TARGET PATH: %s", targetPath)); @@ -99,7 +115,13 @@ public class MigrateActionSet { if (targetFS.exists(targetPath)) { targetFS.delete(targetPath, true); } - runDistcp(distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); + runDistcp( + distcp_num_maps, + distcp_memory_mb, + distcp_task_timeout, + conf, + source, + targetPath); } targetPaths.add(targetPath); @@ -107,19 +129,25 @@ public class MigrateActionSet { } } - props.setProperty(TARGET_PATHS, targetPaths - .stream() - .map(p -> p.toString()) - .collect(Collectors.joining(","))); + props.setProperty( + TARGET_PATHS, + targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","))); File file = new File(System.getProperty("oozie.action.output.properties")); - try(OutputStream os = new FileOutputStream(file)) { + try (OutputStream os = new FileOutputStream(file)) { props.store(os, ""); } System.out.println(file.getAbsolutePath()); } - private void runDistcp(Integer distcp_num_maps, String distcp_memory_mb, String distcp_task_timeout, Configuration conf, Path source, Path targetPath) throws Exception { + private void runDistcp( + Integer distcp_num_maps, + String distcp_memory_mb, + String distcp_task_timeout, + Configuration conf, + Path source, + Path targetPath) + throws Exception { final DistCpOptions op = new DistCpOptions(source, targetPath); op.setMaxMaps(distcp_num_maps); @@ -127,20 +155,25 @@ public class MigrateActionSet { 
op.preserve(DistCpOptions.FileAttribute.REPLICATION); op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); - int res = ToolRunner.run(new DistCp(conf, op), new String[]{ - "-Dmapred.task.timeout=" + distcp_task_timeout, - "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, - "-pb", - "-m " + distcp_num_maps, - source.toString(), - targetPath.toString()}); + int res = + ToolRunner.run( + new DistCp(conf, op), + new String[] { + "-Dmapred.task.timeout=" + distcp_task_timeout, + "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, + "-pb", + "-m " + distcp_num_maps, + source.toString(), + targetPath.toString() + }); if (res != 0) { throw new RuntimeException(String.format("distcp exited with code %s", res)); } } - private Configuration getConfiguration(String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { + private Configuration getConfiguration( + String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { final Configuration conf = new Configuration(); conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); @@ -151,20 +184,20 @@ public class MigrateActionSet { return conf; } - private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) throws ISLookUpException { - String XQUERY = "distinct-values(\n" + - "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" + - "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" + - "let $setDir := $x//SET/@directory/string()\n" + - "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" + - "return concat($basePath, '/', $setDir, '/', $rawSet))"; + private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) + throws ISLookUpException { + String XQUERY = + "distinct-values(\n" + + "let $basePath := 
collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" + + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" + + "let $setDir := $x//SET/@directory/string()\n" + + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" + + "return concat($basePath, '/', $setDir, '/', $rawSet))"; log.info(String.format("running xquery:\n%s", XQUERY)); - return isLookUp.quickSearchProfile(XQUERY) - .stream() + return isLookUp.quickSearchProfile(XQUERY).stream() .map(p -> sourceNN + p) .map(Path::new) .collect(Collectors.toList()); } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java index a7e70ee81..b5d027c03 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/ProtoConverter.java @@ -4,12 +4,11 @@ import com.google.common.collect.Lists; import com.googlecode.protobuf.format.JsonFormat; import eu.dnetlib.data.proto.*; import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.lang3.StringUtils; - import java.io.Serializable; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; public class ProtoConverter implements Serializable { @@ -42,10 +41,12 @@ public class ProtoConverter implements Serializable { rel.setRelType(r.getRelType().toString()); rel.setSubRelType(r.getSubRelType().toString()); rel.setRelClass(r.getRelClass()); - rel.setCollectedFrom(r.getCollectedfromCount() > 0 ? - r.getCollectedfromList().stream() - .map(kv -> mapKV(kv)) - .collect(Collectors.toList()) : null); + rel.setCollectedFrom( + r.getCollectedfromCount() > 0 + ? 
r.getCollectedfromList().stream() + .map(kv -> mapKV(kv)) + .collect(Collectors.toList()) + : null); return rel; } @@ -71,8 +72,7 @@ public class ProtoConverter implements Serializable { final ResultProtos.Result r = oaf.getEntity().getResult(); if (r.getInstanceCount() > 0) { - return r.getInstanceList() - .stream() + return r.getInstanceList().stream() .map(i -> convertInstance(i)) .collect(Collectors.toList()); } @@ -96,15 +96,16 @@ public class ProtoConverter implements Serializable { } private static Organization convertOrganization(OafProtos.Oaf oaf) { - final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); + final OrganizationProtos.Organization.Metadata m = + oaf.getEntity().getOrganization().getMetadata(); final Organization org = setOaf(new Organization(), oaf); setEntity(org, oaf); org.setLegalshortname(mapStringField(m.getLegalshortname())); org.setLegalname(mapStringField(m.getLegalname())); - org.setAlternativeNames(m.getAlternativeNamesList(). 
- stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + org.setAlternativeNames( + m.getAlternativeNamesList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); org.setWebsiteurl(mapStringField(m.getWebsiteurl())); org.setLogourl(mapStringField(m.getLogourl())); org.setEclegalbody(mapStringField(m.getEclegalbody())); @@ -112,7 +113,8 @@ public class ProtoConverter implements Serializable { org.setEcnonprofit(mapStringField(m.getEcnonprofit())); org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); org.setEchighereducation(mapStringField(m.getEchighereducation())); - org.setEcinternationalorganizationeurinterests(mapStringField(m.getEcinternationalorganizationeurinterests())); + org.setEcinternationalorganizationeurinterests( + mapStringField(m.getEcinternationalorganizationeurinterests())); org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); org.setEcenterprise(mapStringField(m.getEcenterprise())); org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); @@ -123,13 +125,14 @@ public class ProtoConverter implements Serializable { } private static Datasource convertDataSource(OafProtos.Oaf oaf) { - final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); + final DatasourceProtos.Datasource.Metadata m = + oaf.getEntity().getDatasource().getMetadata(); final Datasource datasource = setOaf(new Datasource(), oaf); setEntity(datasource, oaf); - datasource.setAccessinfopackage(m.getAccessinfopackageList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + datasource.setAccessinfopackage( + m.getAccessinfopackageList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); datasource.setCertificates(mapStringField(m.getCertificates())); datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); 
datasource.setContactemail(mapStringField(m.getContactemail())); @@ -148,37 +151,36 @@ public class ProtoConverter implements Serializable { datasource.setLogourl(mapStringField(m.getLogourl())); datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); - datasource.setOdcontenttypes(m.getOdcontenttypesList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdlanguages(m.getOdlanguagesList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + datasource.setOdcontenttypes( + m.getOdcontenttypesList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setOdlanguages( + m.getOdlanguagesList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); datasource.setOdpolicies(mapStringField(m.getOdpolicies())); datasource.setOfficialname(mapStringField(m.getOfficialname())); datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); datasource.setPidsystems(mapStringField(m.getPidsystems())); - datasource.setPolicies(m.getPoliciesList() - .stream() - .map(ProtoConverter::mapKV) - .collect(Collectors.toList())); + datasource.setPolicies( + m.getPoliciesList().stream() + .map(ProtoConverter::mapKV) + .collect(Collectors.toList())); datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); - datasource.setSubjects(m.getSubjectsList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); + 
datasource.setSubjects( + m.getSubjectsList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); datasource.setVersioning(mapBoolField(m.getVersioning())); datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); datasource.setJournal(mapJournal(m.getJournal())); - return datasource; } @@ -204,14 +206,16 @@ public class ProtoConverter implements Serializable { project.setFundedamount(m.getFundedamount()); project.setTotalcost(m.getTotalcost()); project.setKeywords(mapStringField(m.getKeywords())); - project.setSubjects(m.getSubjectsList().stream() - .map(sp -> mapStructuredProperty(sp)) - .collect(Collectors.toList())); + project.setSubjects( + m.getSubjectsList().stream() + .map(sp -> mapStructuredProperty(sp)) + .collect(Collectors.toList())); project.setTitle(mapStringField(m.getTitle())); project.setWebsiteurl(mapStringField(m.getWebsiteurl())); - project.setFundingtree(m.getFundingtreeList().stream() - .map(f -> mapStringField(f)) - .collect(Collectors.toList())); + project.setFundingtree( + m.getFundingtreeList().stream() + .map(f -> mapStringField(f)) + .collect(Collectors.toList())); project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); project.setSummary(mapStringField(m.getSummary())); project.setOptional1(mapStringField(m.getOptional1())); @@ -242,14 +246,14 @@ public class ProtoConverter implements Serializable { setEntity(software, oaf); setResult(software, oaf); - software.setDocumentationUrl(m.getDocumentationUrlList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - software.setLicense(m.getLicenseList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); + software.setDocumentationUrl( + m.getDocumentationUrlList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + software.setLicense( + m.getLicenseList().stream() + .map(ProtoConverter::mapStructuredProperty) + 
.collect(Collectors.toList())); software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); return software; @@ -260,18 +264,18 @@ public class ProtoConverter implements Serializable { OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); setEntity(otherResearchProducts, oaf); setResult(otherResearchProducts, oaf); - otherResearchProducts.setContactperson(m.getContactpersonList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setContactgroup(m.getContactgroupList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setTool(m.getToolList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + otherResearchProducts.setContactperson( + m.getContactpersonList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setContactgroup( + m.getContactgroupList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts.setTool( + m.getToolList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); return otherResearchProducts; } @@ -298,12 +302,11 @@ public class ProtoConverter implements Serializable { dataset.setVersion(mapStringField(m.getVersion())); dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); - dataset.setGeolocation(m.getGeolocationList() - .stream() - .map(ProtoConverter::mapGeolocation) - .collect(Collectors.toList())); + dataset.setGeolocation( + m.getGeolocationList().stream() + .map(ProtoConverter::mapGeolocation) + .collect(Collectors.toList())); return dataset; - } public static T setOaf(T oaf, OafProtos.Oaf o) { @@ -313,100 +316,103 @@ public class 
ProtoConverter implements Serializable { } public static T setEntity(T entity, OafProtos.Oaf oaf) { - //setting Entity fields + // setting Entity fields final OafProtos.OafEntity e = oaf.getEntity(); entity.setId(e.getId()); entity.setOriginalId(e.getOriginalIdList()); - entity.setCollectedfrom(e.getCollectedfromList() - .stream() - .map(ProtoConverter::mapKV) - .collect(Collectors.toList())); - entity.setPid(e.getPidList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); + entity.setCollectedfrom( + e.getCollectedfromList().stream() + .map(ProtoConverter::mapKV) + .collect(Collectors.toList())); + entity.setPid( + e.getPidList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); entity.setDateofcollection(e.getDateofcollection()); entity.setDateoftransformation(e.getDateoftransformation()); - entity.setExtraInfo(e.getExtraInfoList() - .stream() - .map(ProtoConverter::mapExtraInfo) - .collect(Collectors.toList())); + entity.setExtraInfo( + e.getExtraInfoList().stream() + .map(ProtoConverter::mapExtraInfo) + .collect(Collectors.toList())); return entity; } public static T setResult(T entity, OafProtos.Oaf oaf) { - //setting Entity fields + // setting Entity fields final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - entity.setAuthor(m.getAuthorList() - .stream() - .map(ProtoConverter::mapAuthor) - .collect(Collectors.toList())); + entity.setAuthor( + m.getAuthorList().stream() + .map(ProtoConverter::mapAuthor) + .collect(Collectors.toList())); entity.setResulttype(mapQualifier(m.getResulttype())); entity.setLanguage(mapQualifier(m.getLanguage())); - entity.setCountry(m.getCountryList() - .stream() - .map(ProtoConverter::mapQualifierAsCountry) - .collect(Collectors.toList())); - entity.setSubject(m.getSubjectList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setTitle(m.getTitleList() - .stream() - 
.map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setRelevantdate(m.getRelevantdateList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDescription(m.getDescriptionList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + entity.setCountry( + m.getCountryList().stream() + .map(ProtoConverter::mapQualifierAsCountry) + .collect(Collectors.toList())); + entity.setSubject( + m.getSubjectList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setTitle( + m.getTitleList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setRelevantdate( + m.getRelevantdateList().stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDescription( + m.getDescriptionList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); entity.setPublisher(mapStringField(m.getPublisher())); entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); - entity.setSource(m.getSourceList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFulltext(m.getFulltextList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFormat(m.getFormatList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContributor(m.getContributorList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + entity.setSource( + m.getSourceList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFulltext( + m.getFulltextList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setFormat( + 
m.getFormatList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContributor( + m.getContributorList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); entity.setResourcetype(mapQualifier(m.getResourcetype())); - entity.setCoverage(m.getCoverageList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContext(m.getContextList() - .stream() - .map(ProtoConverter::mapContext) - .collect(Collectors.toList())); + entity.setCoverage( + m.getCoverageList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setContext( + m.getContextList().stream() + .map(ProtoConverter::mapContext) + .collect(Collectors.toList())); - entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); + entity.setBestaccessright( + getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); return entity; } private static Qualifier getBestAccessRights(List instanceList) { if (instanceList != null) { - final Optional min = instanceList.stream() - .map(i -> i.getAccessright()).min(new LicenseComparator()); + final Optional min = + instanceList.stream().map(i -> i.getAccessright()).min(new LicenseComparator()); final Qualifier rights = min.isPresent() ? 
mapQualifier(min.get()) : new Qualifier(); if (StringUtils.isBlank(rights.getClassid())) { rights.setClassid(UNKNOWN); } - if (StringUtils.isBlank(rights.getClassname()) || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + if (StringUtils.isBlank(rights.getClassname()) + || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { rights.setClassname(NOT_AVAILABLE); } if (StringUtils.isBlank(rights.getSchemeid())) { @@ -425,14 +431,13 @@ public class ProtoConverter implements Serializable { final Context entity = new Context(); entity.setId(context.getId()); - entity.setDataInfo(context.getDataInfoList() - .stream() - .map(ProtoConverter::mapDataInfo) - .collect(Collectors.toList())); + entity.setDataInfo( + context.getDataInfoList().stream() + .map(ProtoConverter::mapDataInfo) + .collect(Collectors.toList())); return entity; } - public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { final KeyValue keyValue = new KeyValue(); keyValue.setKey(kv.getKey()); @@ -495,7 +500,8 @@ public class ProtoConverter implements Serializable { return entity; } - public static OriginDescription mapOriginalDescription(FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { + public static OriginDescription mapOriginalDescription( + FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { final OriginDescription originDescriptionResult = new OriginDescription(); originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); originDescriptionResult.setAltered(originDescription.getAltered()); @@ -550,24 +556,24 @@ public class ProtoConverter implements Serializable { entity.setName(author.getName()); entity.setSurname(author.getSurname()); entity.setRank(author.getRank()); - entity.setPid(author.getPidList() - .stream() - .map(kv -> { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(kv.getValue()); - final Qualifier q = new Qualifier(); - q.setClassid(kv.getKey()); - q.setClassname(kv.getKey()); - sp.setQualifier(q); - return 
sp; - }) - .collect(Collectors.toList())); - entity.setAffiliation(author.getAffiliationList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); + entity.setPid( + author.getPidList().stream() + .map( + kv -> { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(kv.getValue()); + final Qualifier q = new Qualifier(); + q.setClassid(kv.getKey()); + q.setClassname(kv.getKey()); + sp.setQualifier(q); + return sp; + }) + .collect(Collectors.toList())); + entity.setAffiliation( + author.getAffiliationList().stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); return entity; - } public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java index 65dec7b7f..81c2e7705 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/actions/TransformActions.java @@ -5,14 +5,17 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.protobuf.InvalidProtocolBufferException; -import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.data.proto.OafProtos; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; - +import java.io.IOException; +import java.io.Serializable; +import java.util.LinkedList; +import java.util.Objects; import org.apache.commons.io.IOUtils; import 
org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; @@ -21,16 +24,9 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.io.IOException; -import java.io.Serializable; -import java.util.LinkedList; -import java.util.Objects; public class TransformActions implements Serializable { @@ -38,9 +34,11 @@ public class TransformActions implements Serializable { private static final String SEPARATOR = "/"; public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateActionSet.class.getResourceAsStream( + "/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json"))); parser.parseArgument(args); new TransformActions().run(parser); @@ -51,7 +49,7 @@ public class TransformActions implements Serializable { final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: " + isLookupUrl); - final String inputPaths = parser.get("inputPaths"); + final String inputPaths = parser.get("inputPaths"); if (StringUtils.isBlank(inputPaths)) { throw new RuntimeException("empty inputPaths"); @@ -60,18 +58,25 @@ public class TransformActions implements Serializable { final String targetBaseDir = getTargetBaseDir(isLookupUrl); - try(SparkSession spark = getSparkSession(parser)) { + try (SparkSession spark = getSparkSession(parser)) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final 
FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - for(String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { + for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { - LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); + LinkedList pathQ = + Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); final String rawset = pathQ.pollLast(); final String actionSetDirectory = pathQ.pollLast(); - final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); + final Path targetDirectory = + new Path( + targetBaseDir + + SEPARATOR + + actionSetDirectory + + SEPARATOR + + rawset); if (fs.exists(targetDirectory)) { log.info(String.format("found target directory '%s", targetDirectory)); @@ -79,20 +84,27 @@ public class TransformActions implements Serializable { log.info(String.format("deleted target directory '%s", targetDirectory)); } - log.info(String.format("transforming actions from '%s' to '%s'", sourcePath, targetDirectory)); + log.info( + String.format( + "transforming actions from '%s' to '%s'", + sourcePath, targetDirectory)); sc.sequenceFile(sourcePath, Text.class, Text.class) - .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) - .map(a -> doTransform(a)) - .filter(Objects::isNull) - .filter(a -> a.getPayload() == null) - .map(a -> new ObjectMapper().writeValueAsString(a)) - .saveAsTextFile(targetDirectory.toString(), GzipCodec.class); + .map( + a -> + eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON( + a._2().toString())) + .map(a -> doTransform(a)) + .filter(Objects::isNull) + .filter(a -> a.getPayload() == null) + .map(a -> new ObjectMapper().writeValueAsString(a)) + .saveAsTextFile(targetDirectory.toString(), GzipCodec.class); } } } - private Text transformAction(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException, 
JsonProcessingException { + private Text transformAction(eu.dnetlib.actionmanager.actions.AtomicAction aa) + throws InvalidProtocolBufferException, JsonProcessingException { final Text out = new Text(); final ObjectMapper mapper = new ObjectMapper(); if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) { @@ -135,7 +147,8 @@ public class TransformActions implements Serializable { return new AtomicAction<>(Relation.class, rel); } - private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) throws InvalidProtocolBufferException { + private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) + throws InvalidProtocolBufferException { final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); final Oaf oaf = ProtoConverter.convert(proto_oaf); switch (proto_oaf.getKind()) { @@ -148,14 +161,21 @@ public class TransformActions implements Serializable { case project: return new AtomicAction<>(Project.class, (Project) oaf); case result: - final String resulttypeid = proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid(); + final String resulttypeid = + proto_oaf + .getEntity() + .getResult() + .getMetadata() + .getResulttype() + .getClassid(); switch (resulttypeid) { case "publication": return new AtomicAction<>(Publication.class, (Publication) oaf); case "software": return new AtomicAction<>(Software.class, (Software) oaf); case "other": - return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); + return new AtomicAction<>( + OtherResearchProduct.class, (OtherResearchProduct) oaf); case "dataset": return new AtomicAction<>(Dataset.class, (Dataset) oaf); default: @@ -163,7 +183,8 @@ public class TransformActions implements Serializable { return new AtomicAction<>(Result.class, (Result) oaf); } default: - throw new IllegalArgumentException("invalid entity type: " + proto_oaf.getEntity().getType()); + throw new IllegalArgumentException( + "invalid 
entity type: " + proto_oaf.getEntity().getType()); } case relation: return new AtomicAction<>(Relation.class, (Relation) oaf); @@ -174,15 +195,15 @@ public class TransformActions implements Serializable { private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; + String XQUERY = + "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; return isLookUp.getResourceProfileByQuery(XQUERY); } private static SparkSession getSparkSession(ArgumentApplicationParser parser) { SparkConf conf = new SparkConf(); - return SparkSession - .builder() + return SparkSession.builder() .appName(TransformActions.class.getSimpleName()) .master(parser.get("master")) .config(conf) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java index 5cd78491b..9e07a2d47 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java @@ -3,30 +3,33 @@ package eu.dnetlib.dhp.transformation; import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.functions.Cleaner; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import java.io.ByteArrayInputStream; +import java.io.StringWriter; +import java.util.Map; +import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.*; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.util.LongAccumulator; -import 
javax.xml.transform.stream.StreamSource; -import java.io.ByteArrayInputStream; -import java.io.StringWriter; -import java.util.Map; - public class TransformFunction implements MapFunction { - private final LongAccumulator totalItems; private final LongAccumulator errorItems; private final LongAccumulator transformedItems; private final String transformationRule; private final Cleaner cleanFunction; - private final long dateOfTransformation; - - public TransformFunction(LongAccumulator totalItems, LongAccumulator errorItems, LongAccumulator transformedItems, final String transformationRule, long dateOfTransformation, final Map vocabularies) throws Exception { - this.totalItems= totalItems; + public TransformFunction( + LongAccumulator totalItems, + LongAccumulator errorItems, + LongAccumulator transformedItems, + final String transformationRule, + long dateOfTransformation, + final Map vocabularies) + throws Exception { + this.totalItems = totalItems; this.errorItems = errorItems; this.transformedItems = transformedItems; this.transformationRule = transformationRule; @@ -41,13 +44,21 @@ public class TransformFunction implements MapFunction encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); + final Dataset mdstoreInput = + spark.read().format("parquet").load(inputPath).as(encoder); final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); + final LongAccumulator transformedItems = + spark.sparkContext().longAccumulator("transformedItems"); final Map vocabularies = new HashMap<>(); vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - final TransformFunction transformFunction = new TransformFunction(totalItems, errorItems, 
transformedItems, trasformationRule, dateOfCollection, vocabularies) ; + final TransformFunction transformFunction = + new TransformFunction( + totalItems, + errorItems, + transformedItems, + trasformationRule, + dateOfCollection, + vocabularies); mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); if (rabbitHost != null) { System.out.println("SEND FINAL REPORT"); final Map reportMap = new HashMap<>(); - reportMap.put("inputItem" , ""+ totalItems.value()); + reportMap.put("inputItem", "" + totalItems.value()); reportMap.put("invalidRecords", "" + errorItems.value()); reportMap.put("mdStoreSize", "" + transformedItems.value()); System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); if (!test) { - final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, null); - manager.sendMessage(new Message(workflowId, "Transform", MessageType.REPORT, reportMap), rabbitReportQueue, true, false); + final MessageManager manager = + new MessageManager( + rabbitHost, rabbitUser, rabbitPassword, false, false, null); + manager.sendMessage( + new Message(workflowId, "Transform", MessageType.REPORT, reportMap), + rabbitReportQueue, + true, + false); manager.close(); } } } - private static String extractXSLTFromTR(final String tr) throws DocumentException { SAXReader reader = new SAXReader(); Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); return node.asXML(); } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java index 47f33a342..a15e05864 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java @@ -2,17 +2,14 @@ package eu.dnetlib.dhp.transformation.functions; import eu.dnetlib.dhp.transformation.vocabulary.Term; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import java.util.Map; +import java.util.Optional; import net.sf.saxon.s9api.*; import scala.Serializable; -import java.util.Map; -import java.util.Optional; - public class Cleaner implements ExtensionFunction, Serializable { - - private final Map vocabularies; - + private final Map vocabularies; public Cleaner(Map vocabularies) { this.vocabularies = vocabularies; @@ -30,20 +27,22 @@ public class Cleaner implements ExtensionFunction, Serializable { @Override public SequenceType[] getArgumentTypes() { - return new SequenceType[] - { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) - - }; + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) + }; } @Override public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { final String currentValue = xdmValues[0].itemAt(0).getStringValue(); - final String vocabularyName =xdmValues[1].itemAt(0).getStringValue(); - Optional cleanedValue = vocabularies.get(vocabularyName).getTerms().stream().filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)).findAny(); + final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); + Optional cleanedValue = + vocabularies.get(vocabularyName).getTerms().stream() + .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) + .findAny(); - return new XdmAtomicValue(cleanedValue.isPresent()?cleanedValue.get().getCode():currentValue); + return new XdmAtomicValue( + cleanedValue.isPresent() ? 
cleanedValue.get().getCode() : currentValue); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java index f93c2a120..3f02098b1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java @@ -10,7 +10,6 @@ public class Term implements Serializable { private String code; private String synonyms; - public String getEnglishName() { return englishName; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java index 58e9cb95c..1268924ef 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.transformation.vocabulary; import java.io.Serializable; import java.util.List; -import java.util.Map; public class Vocabulary implements Serializable { @@ -51,7 +50,4 @@ public class Vocabulary implements Serializable { public void setTerms(List terms) { this.terms = terms; } - - - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java index b6ecf795c..455777b31 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java @@ -1,23 +1,21 @@ package 
eu.dnetlib.dhp.transformation.vocabulary; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.io.IOUtils; - import java.io.Serializable; import java.net.URL; import java.nio.charset.Charset; +import org.apache.commons.io.IOUtils; public class VocabularyHelper implements Serializable { - private final static String OPENAIRE_URL ="http://api.openaire.eu/vocabularies/%s.json"; + private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); return vocabulary; } - } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index fde928a8b..88fdd4b9c 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -1,89 +1,106 @@ package eu.dnetlib.dhp.collection; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.AfterEach; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; - import static 
org.junit.jupiter.api.Assertions.*; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + public class CollectionJobTest { - private Path testDir; + private Path testDir; - @BeforeEach - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } + @BeforeEach + public void setup() throws IOException { + testDir = Files.createTempDirectory("dhp-collection"); + } - @AfterEach - public void teadDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } + @AfterEach + public void teadDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + } - @Test - public void tesCollection() throws Exception { - final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - GenerateNativeStoreSparkJob.main(new String[] { - "-mt", "local", - "-w", "wid", - "-e", "XML", - "-d", "" + System.currentTimeMillis(), - "-p", new ObjectMapper().writeValueAsString(provenance), - "-x", "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", testDir.toString() + "/store", - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", "" }); - System.out.println(new ObjectMapper().writeValueAsString(provenance)); - } + @Test + public void tesCollection() throws Exception { + final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); + GenerateNativeStoreSparkJob.main( + new String[] { + "-mt", "local", + "-w", "wid", + "-e", "XML", + "-d", "" + 
System.currentTimeMillis(), + "-p", new ObjectMapper().writeValueAsString(provenance), + "-x", + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "-i", + this.getClass() + .getResource("/eu/dnetlib/dhp/collection/native.seq") + .toString(), + "-o", testDir.toString() + "/store", + "-t", "true", + "-ru", "", + "-rp", "", + "-rh", "", + "-ro", "", + "-rr", "" + }); + System.out.println(new ObjectMapper().writeValueAsString(provenance)); + } - @Test - public void testGenerationMetadataRecord() throws Exception { + @Test + public void testGenerationMetadataRecord() throws Exception { - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", - "ns_prefix"), System.currentTimeMillis(), null, null); + final MetadataRecord record = + GenerateNativeStoreSparkJob.parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); - assert record != null; - System.out.println(record.getId()); - System.out.println(record.getOriginalId()); + assert record != null; + System.out.println(record.getId()); + System.out.println(record.getOriginalId()); + } - } - - @Test - public void TestEquals() throws IOException { - - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = GenerateNativeStoreSparkJob - .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", - "ns_prefix"), System.currentTimeMillis(), null, null); - final MetadataRecord record1 
= GenerateNativeStoreSparkJob - .parseRecord(xml, "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", "XML", new Provenance("foo", "bar", - "ns_prefix"), System.currentTimeMillis(), null, null); - assert record != null; - record.setBody("ciao"); - assert record1 != null; - record1.setBody("mondo"); - assertEquals(record, record1); - - } + @Test + public void TestEquals() throws IOException { + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = + GenerateNativeStoreSparkJob.parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + final MetadataRecord record1 = + GenerateNativeStoreSparkJob.parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + assert record != null; + record.setBody("ciao"); + assert record1 != null; + record1.setBody("mondo"); + assertEquals(record, record1); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index 665e989d8..7957d9041 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -1,5 +1,8 @@ package eu.dnetlib.dhp.collector.worker; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.mockito.Mockito.*; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.collector.worker.model.ApiDescriptor; 
import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -7,23 +10,18 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; +import java.io.File; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.File; - -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.mockito.Mockito.*; - - public class DnetCollectorWorkerApplicationTests { - private ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); private MessageManager messageManager = mock(MessageManager.class); private DnetCollectorWorker worker; + @BeforeEach public void setup() throws Exception { ObjectMapper mapper = new ObjectMapper(); @@ -35,25 +33,30 @@ public class DnetCollectorWorkerApplicationTests { when(argumentParser.get("workflowId")).thenReturn("sandro"); when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); - when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(),anyBoolean())).thenAnswer(a -> { - System.out.println("sent message: "+a.getArguments()[0]); - return true; - }); - when(messageManager.sendMessage(any(Message.class), anyString())).thenAnswer(a -> { - System.out.println("Called"); - return true; - }); - worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); + when(messageManager.sendMessage( + any(Message.class), anyString(), anyBoolean(), anyBoolean())) + .thenAnswer( + a -> { + System.out.println("sent message: " + a.getArguments()[0]); + return true; + }); + when(messageManager.sendMessage(any(Message.class), anyString())) + .thenAnswer( + a -> { + System.out.println("Called"); + return true; + }); + worker = + new DnetCollectorWorker( + new CollectorPluginFactory(), argumentParser, messageManager); } - @AfterEach - 
public void dropDown(){ + public void dropDown() { File f = new File("/tmp/file.seq"); f.delete(); } - @Test public void testFindPlugin() throws Exception { final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); @@ -61,7 +64,6 @@ public class DnetCollectorWorkerApplicationTests { assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); } - @Test public void testCollectionOAI() throws Exception { final ApiDescriptor api = new ApiDescriptor(); @@ -86,5 +88,4 @@ public class DnetCollectorWorkerApplicationTests { api.getParams().put("format", "oai_dc"); return api; } - } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index dfa0c3720..5bc9434cb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,10 +1,18 @@ package eu.dnetlib.dhp.transformation; +import static org.junit.jupiter.api.Assertions.assertNotNull; + import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.functions.Cleaner; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; import eu.dnetlib.dhp.utils.DHPUtils; +import java.io.StringWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.*; import org.apache.commons.io.IOUtils; import org.apache.spark.util.LongAccumulator; @@ -18,20 +26,10 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import javax.xml.transform.stream.StreamSource; -import java.io.StringWriter; -import 
java.nio.file.Files; -import java.nio.file.Path; -import java.util.HashMap; -import java.util.Map; - -import static org.junit.jupiter.api.Assertions.assertNotNull; - @ExtendWith(MockitoExtension.class) public class TransformationJobTest { - @Mock - private LongAccumulator accumulator; + @Mock private LongAccumulator accumulator; @Test public void testTransformSaxonHE() throws Exception { @@ -42,13 +40,24 @@ public class TransformationJobTest { Processor proc = new Processor(false); proc.registerExtensionFunction(cleanFunction); final XsltCompiler comp = proc.newXsltCompiler(); - XsltExecutable exp = comp.compile(new StreamSource(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); - XdmNode source = proc.newDocumentBuilder().build(new StreamSource(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + XsltExecutable exp = + comp.compile( + new StreamSource( + this.getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/transform/ext_simple.xsl"))); + XdmNode source = + proc.newDocumentBuilder() + .build( + new StreamSource( + this.getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/transform/input.xml"))); XsltTransformer trans = exp.load(); trans.setInitialContextNode(source); final StringWriter output = new StringWriter(); Serializer out = proc.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD,"xml"); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); out.setOutputProperty(Serializer.Property.INDENT, "yes"); trans.setDestination(out); trans.transform(); @@ -58,27 +67,35 @@ public class TransformationJobTest { @DisplayName("Test TransformSparkJobNode.main") @Test public void transformTest(@TempDir Path testDir) throws Exception { - final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - final String mdstore_output = testDir.toString()+"/version"; - final String xslt = 
DHPUtils.compressString(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); - TransformSparkJobNode.main(new String[]{ - "-mt", "local", - "-i", mdstore_input, - "-o", mdstore_output, - "-d", "1", - "-w", "1", - "-tr", xslt, - "-t", "true", - "-ru", "", - "-rp", "", - "-rh", "", - "-ro", "", - "-rr", ""}); + final String mdstore_input = + this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String mdstore_output = testDir.toString() + "/version"; + final String xslt = + DHPUtils.compressString( + IOUtils.toString( + this.getClass() + .getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); + TransformSparkJobNode.main( + new String[] { + "-mt", "local", + "-i", mdstore_input, + "-o", mdstore_output, + "-d", "1", + "-w", "1", + "-tr", xslt, + "-t", "true", + "-ru", "", + "-rp", "", + "-rh", "", + "-ro", "", + "-rr", "" + }); } @Test public void tryLoadFolderOnCP() throws Exception { - final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String path = + this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); System.out.println("path = " + path); Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); @@ -88,20 +105,25 @@ public class TransformationJobTest { Files.deleteIfExists(tempDirWithPrefix); } - @Test public void testTransformFunction() throws Exception { SAXReader reader = new SAXReader(); - Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Document document = + reader.read( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); final String xslt = node.asXML(); Map vocabularies = new HashMap<>(); vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - TransformFunction tf = new 
TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); + TransformFunction tf = + new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); MetadataRecord record = new MetadataRecord(); - record.setBody(IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + record.setBody( + IOUtils.toString( + this.getClass() + .getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); final MetadataRecord result = tf.call(record); assertNotNull(result.getBody()); @@ -109,17 +131,19 @@ public class TransformationJobTest { System.out.println(result.getBody()); } - @Test public void extractTr() throws Exception { - final String xmlTr = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + final String xmlTr = + IOUtils.toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); SAXReader reader = new SAXReader(); - Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Document document = + reader.read( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); System.out.println(node.asXML()); } - } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java index c2db17a9d..04b933fb5 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java @@ -1,17 +1,15 @@ package eu.dnetlib.dhp.transformation.vocabulary; -import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; + +import org.junit.jupiter.api.Test; + public class 
VocabularyTest { - - @Test public void testLoadVocabulary() throws Exception { final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); - assertEquals("dnet:languages",vocabulary.getName()); - - + assertEquals("dnet:languages", vocabulary.getName()); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index f52e4bb39..bb310050c 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -4,11 +4,15 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import java.io.IOException; +import java.io.Serializable; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SaveMode; @@ -18,19 +22,13 @@ import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; -import java.io.IOException; -import java.io.Serializable; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; - abstract class AbstractSparkAction implements Serializable { - protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = + new 
ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public ArgumentApplicationParser parser; //parameters for the spark action - public SparkSession spark; //the spark session + public ArgumentApplicationParser parser; // parameters for the spark action + public SparkSession spark; // the spark session public AbstractSparkAction(ArgumentApplicationParser parser, SparkSession spark) { @@ -38,9 +36,12 @@ abstract class AbstractSparkAction implements Serializable { this.spark = spark; } - public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) throws ISLookUpException, DocumentException, IOException { + public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) + throws ISLookUpException, DocumentException, IOException { - final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = + String.format( + "/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); @@ -57,14 +58,16 @@ abstract class AbstractSparkAction implements Serializable { return configurations; } - private DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + private DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) throws ISLookUpException, IOException { final Element s = (Element) o; final String configProfileId = s.attributeValue("id"); final String conf = - isLookUpService.getResourceProfileByQuery(String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); + isLookUpService.getResourceProfileByQuery( + String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); DedupConfig 
dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); dedupConfig.getPace().initModel(); @@ -74,21 +77,15 @@ abstract class AbstractSparkAction implements Serializable { return dedupConfig; } - abstract void run(ISLookUpService isLookUpService) throws DocumentException, IOException, ISLookUpException; + abstract void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException; protected static SparkSession getSparkSession(SparkConf conf) { - return SparkSession - .builder() - .config(conf) - .getOrCreate(); + return SparkSession.builder().config(conf).getOrCreate(); } protected static void save(Dataset dataset, String outPath, SaveMode mode) { - dataset - .write() - .option("compression", "gzip") - .mode(mode) - .json(outPath); + dataset.write().option("compression", "gzip").mode(mode).json(outPath); } protected static void removeOutputDir(SparkSession spark, String path) { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java index b4d0e268a..905fbace6 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java @@ -1,18 +1,17 @@ package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.schema.oaf.Field; -import org.apache.commons.lang.StringUtils; - -import java.time.Year; -import java.util.*; -import java.util.stream.Collectors; - import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; +import eu.dnetlib.dhp.schema.oaf.Field; +import java.time.Year; +import java.util.*; +import java.util.stream.Collectors; +import 
org.apache.commons.lang.StringUtils; + public class DatePicker { private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; @@ -22,33 +21,32 @@ public class DatePicker { public static Field pick(final Collection dateofacceptance) { - final Map frequencies = dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect( - Collectors.toConcurrentMap( - w -> w, w -> 1, Integer::sum)); + final Map frequencies = + dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); if (frequencies.isEmpty()) { return new Field<>(); } final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + date.setValue(frequencies.keySet().iterator().next()); // let's sort this map by values first, filtering out invalid dates - final Map sorted = frequencies - .entrySet() - .stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap( - Map.Entry::getKey, - Map.Entry::getValue, (e1, e2) -> e2, - LinkedHashMap::new)); + final Map sorted = + frequencies.entrySet().stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e2, + LinkedHashMap::new)); // shortcut if (sorted.size() == 0) { @@ -58,18 +56,24 @@ public class DatePicker { // voting method (1/3 + 1) wins if (sorted.size() >= 3) { final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + final List accepted = + sorted.entrySet().stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e 
-> e.getKey()) + .collect(Collectors.toList()); // cannot find strong majority if (accepted.isEmpty()) { final int max = sorted.values().iterator().next(); - Optional first = sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); + Optional first = + sorted.entrySet().stream() + .filter( + e -> + e.getValue() == max + && !endsWith( + e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); if (first.isPresent()) { date.setValue(first.get()); return date; @@ -83,9 +87,10 @@ public class DatePicker { date.setValue(accepted.get(0)); return date; } else { - final Optional first = accepted.stream() - .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) - .findFirst(); + final Optional first = + accepted.stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); if (first.isPresent()) { date.setValue(first.get()); return date; @@ -94,7 +99,7 @@ public class DatePicker { return date; } - //1st non YYYY-01-01 is returned + // 1st non YYYY-01-01 is returned } else { if (sorted.size() == 2) { for (Map.Entry e : sorted.entrySet()) { @@ -115,5 +120,4 @@ public class DatePicker { final int year = Integer.parseInt(substringBefore(date, "-")); return year >= YEAR_LB && year <= YEAR_UB; } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 1c957c9e0..1cb04d02f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -3,63 +3,76 @@ package eu.dnetlib.dhp.oa.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; 
-import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; -import org.apache.spark.api.java.JavaPairRDD; +import java.util.Collection; +import java.util.Iterator; import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; - import scala.Tuple2; -import java.util.Collection; -import java.util.Iterator; - public class DedupRecordFactory { - protected static final ObjectMapper OBJECT_MAPPER = new com.fasterxml.jackson.databind.ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = + new com.fasterxml.jackson.databind.ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); public static Dataset createDedupRecord( - final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final Class clazz) { + final SparkSession spark, + final String mergeRelsInputPath, + final String entitiesInputPath, + final Class clazz, + final DedupConfig dedupConf) { long ts = System.currentTimeMillis(); - // - Dataset> entities = spark.read() - .textFile(entitiesInputPath) - .map((MapFunction>) s -> { - T entity = OBJECT_MAPPER.readValue(s, clazz); - return new Tuple2<>(entity.getId(), entity); - }, Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + // + Dataset> entities = + spark.read() + .textFile(entitiesInputPath) + .map( + (MapFunction>) + it -> { + T entity = OBJECT_MAPPER.readValue(it, clazz); + return new Tuple2<>(entity.getId(), entity); + }, + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + // : source is the 
dedup_id, target is the id of the mergedIn + Dataset> mergeRels = + spark.read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'") + .map( + (MapFunction>) + r -> new Tuple2<>(r.getSource(), r.getTarget()), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - //: source is the dedup_id, target is the id of the mergedIn - Dataset> mergeRels = spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'") - .map((MapFunction>) - r -> new Tuple2<>(r.getSource(), r.getTarget()), Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - - // - return mergeRels.joinWith(entities, mergeRels.col("_1").equalTo(entities.col("_1")), "left_outer") - .filter((FilterFunction, Tuple2>>) value -> value._2() != null) - .map((MapFunction, Tuple2>, T>) - value -> value._2()._2(), Encoders.kryo(clazz)) + // + return mergeRels + .joinWith(entities, mergeRels.col("_1").equalTo(entities.col("_1")), "left_outer") + .filter( + (FilterFunction, Tuple2>>) + value -> value._2() != null) + .map( + (MapFunction, Tuple2>, T>) + value -> value._2()._2(), + Encoders.kryo(clazz)) .groupByKey((MapFunction) value -> value.getId(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) - (key, values) -> entityMerger(key, values, ts, clazz), Encoders.bean(clazz)); + .mapGroups( + (MapGroupsFunction) + (key, values) -> entityMerger(key, values, ts, clazz), + Encoders.bean(clazz)); } - private static T entityMerger(String id, Iterator entities, final long ts, Class clazz) { + private static T entityMerger( + String id, Iterator entities, final long ts, Class clazz) { try { T entity = clazz.newInstance(); entity.setId(id); @@ -70,18 +83,19 @@ public class DedupRecordFactory { entity.setLastupdatetimestamp(ts); final Collection dates = Lists.newArrayList(); - entities.forEachRemaining(e -> { - entity.mergeFrom(e); - if (ModelSupport.isSubClass(e, Result.class)) { - Result r1 = (Result) e; - Result er = 
(Result) entity; - er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); + entities.forEachRemaining( + e -> { + entity.mergeFrom(e); + if (ModelSupport.isSubClass(e, Result.class)) { + Result r1 = (Result) e; + Result er = (Result) entity; + er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); - if (er.getDateofacceptance() != null) { - dates.add(r1.getDateofacceptance().getValue()); - } - } - }); + if (er.getDateofacceptance() != null) { + dates.add(r1.getDateofacceptance().getValue()); + } + } + }); if (ModelSupport.isSubClass(entity, Result.class)) { ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); @@ -91,5 +105,4 @@ public class DedupRecordFactory { throw new RuntimeException(e); } } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index 39f52151a..a832a5868 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -9,9 +9,13 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; - import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; +import java.io.StringReader; +import java.security.MessageDigest; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.codec.binary.Hex; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkContext; @@ -22,30 +26,43 @@ import org.dom4j.Element; import org.dom4j.io.SAXReader; import scala.Tuple2; -import java.io.StringReader; -import java.security.MessageDigest; -import java.text.Normalizer; -import java.util.*; 
-import java.util.stream.Collectors; - public class DedupUtility { private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator(final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { Map accumulators = new HashMap<>(); - String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + String acc1 = + String.format( + "%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + String acc2 = + String.format( + "%s::%s", + dedupConf.getWf().getEntityType(), + "missing " + dedupConf.getWf().getOrderField()); accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + String acc3 = + String.format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String.format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), + dedupConf.getWf().getGroupMaxSize())); accumulators.put(acc3, context.longAccumulator(acc3)); String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + String acc5 = + String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + String acc6 = + String.format( + "%s::%s", + 
dedupConf.getWf().getEntityType(), + "d < " + dedupConf.getWf().getThreshold()); accumulators.put(acc6, context.longAccumulator(acc6)); return accumulators; @@ -66,7 +83,6 @@ public class DedupUtility { } } - public static List mergeAuthor(final List a, final List b) { int pa = countAuthorsPids(a); int pb = countAuthorsPids(b); @@ -86,32 +102,44 @@ public class DedupUtility { } private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid() - .stream() - .map(p -> new Tuple2<>(p.toComparableString(), a)) - ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + if (base == null || enrich == null) return; + final Map basePidAuthorMap = + base.stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> + a.getPid().stream() + .map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = + enrich.stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> + a.getPid().stream() + .filter( + p -> + !basePidAuthorMap.containsKey( + p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - - pidToEnrich.forEach(a -> { - Optional> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); + pidToEnrich.forEach( + a -> { + Optional> simAuhtor = + 
base.stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); } - public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) { + public static String createDedupRecordPath( + final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); } @@ -119,11 +147,13 @@ public class DedupUtility { return String.format("%s/%s", basePath, entityType); } - public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) { + public static String createSimRelPath( + final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); } - public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) { + public static String createMergeRelPath( + final String basePath, final String actionSetId, final String entityType) { return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } @@ -133,19 +163,21 @@ public class DedupUtility { final Person pb = parse(b); if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler().score( - normalize(pa.getSurnameString()), - normalize(pb.getSurnameString())); + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); } else { - return new JaroWinkler().score( - normalize(pa.getNormalisedFullname()), - normalize(pb.getNormalisedFullname())); + return new JaroWinkler() + .score( + normalize(pa.getNormalisedFullname()), + normalize(pb.getNormalisedFullname())); } } private static String normalize(final String s) { return nfd(s).toLowerCase() - // do not compact the regexes 
in a single expression, would cause StackOverflowError in case of large input strings + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings .replaceAll("(\\W)+", " ") .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") .replaceAll("(\\p{Punct})+", " ") @@ -166,30 +198,29 @@ public class DedupUtility { } } - private static int countAuthorsPids(List authors) { - if (authors == null) - return 0; + if (authors == null) return 0; return (int) authors.stream().filter(DedupUtility::hasPid).count(); } private static int authorsSize(List authors) { - if (authors == null) - return 0; + if (authors == null) return 0; return authors.size(); } private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; + if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } - public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { + public static List getConfigurations(String isLookUpUrl, String orchestrator) + throws ISLookUpException, DocumentException { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); - final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = + String.format( + "/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); @@ -203,17 +234,18 @@ public class DedupUtility { } return configurations; - } - private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + private static DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) 
throws ISLookUpException { final Element s = (Element) o; final String configProfileId = s.attributeValue("id"); final String conf = - isLookUpService.getResourceProfileByQuery(String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); + isLookUpService.getResourceProfileByQuery( + String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); final DedupConfig dedupConfig = DedupConfig.load(conf); dedupConfig.getWf().setConfigurationId(actionSetId); return dedupConfig; diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java index 28b85853f..a7554ebe1 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java @@ -4,47 +4,50 @@ import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; +import java.util.Map; +import java.util.stream.Collectors; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; import scala.Serializable; import scala.Tuple2; -import java.util.Map; -import java.util.stream.Collectors; - public class Deduper implements Serializable { - public static JavaPairRDD computeRelations(JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations( + JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { + Map accumulators = + DedupUtility.constructAccumulator(config, context.sc()); - return blocks - 
.flatMapToPair(it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).processSortedBlock(it._1(), it._2().getDocuments(), reporter); - return reporter.getRelations().iterator(); - }) + return blocks.flatMapToPair( + it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config) + .processSortedBlock(it._1(), it._2().getDocuments(), reporter); + return reporter.getRelations().iterator(); + }) .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) .reduceByKey((a, b) -> a) .mapToPair(Tuple2::_2); } - public static JavaPairRDD createSortedBlocks(JavaPairRDD mapDocs, DedupConfig config) { + public static JavaPairRDD createSortedBlocks( + JavaPairRDD mapDocs, DedupConfig config) { final String of = config.getWf().getOrderField(); final int maxQueueSize = config.getWf().getGroupMaxSize(); return mapDocs - //the reduce is just to be sure that we haven't document with same id + // the reduce is just to be sure that we haven't document with same id .reduceByKey((a, b) -> a) .map(Tuple2::_2) - //Clustering: from to List - .flatMap(a -> DedupUtility.getGroupingKeys(config, a) - .stream() - .map(it -> Block.from(it, a)) - .collect(Collectors.toList()) - .iterator()) + // Clustering: from to List + .flatMap( + a -> + DedupUtility.getGroupingKeys(config, a).stream() + .map(it -> Block.from(it, a)) + .collect(Collectors.toList()) + .iterator()) .mapToPair(block -> new Tuple2<>(block.getKey(), block)) .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index 8e90d2a1f..127a19139 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -8,6 +8,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -18,8 +19,6 @@ import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; - public class SparkCreateDedupRecord extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); @@ -29,9 +28,11 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { } public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); parser.parseArgument(args); SparkConf conf = new SparkConf(); @@ -43,7 +44,8 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { } @Override - public void run(ISLookUpService isLookUpService) throws ISLookUpException, DocumentException, IOException { + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { final String graphBasePath = parser.get("graphBasePath"); final String isLookUpUrl = parser.get("isLookUpUrl"); @@ -55,25 +57,28 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { log.info("actionSetId: '{}'", actionSetId); log.info("workingPath: '{}'", workingPath); 
- for (DedupConfig dedupConf: getConfigurations(isLookUpService, actionSetId)) { + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { String subEntity = dedupConf.getWf().getSubEntityValue(); log.info("Creating deduprecords for: '{}'", subEntity); - final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); + final String outputPath = + DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); removeOutputDir(spark, outputPath); - final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String mergeRelPath = + DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); - DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz) - .map((MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING()) - .write() - .mode(SaveMode.Overwrite) - .json(outputPath); + DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz, dedupConf) + .map( + (MapFunction) + value -> OBJECT_MAPPER.writeValueAsString(value), + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .json(outputPath); } - } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 9c46404b7..ca6d04d49 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -1,8 +1,8 @@ package eu.dnetlib.dhp.oa.dedup; import com.google.common.hash.Hashing; -import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -13,6 +13,10 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; @@ -25,14 +29,9 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.dom4j.DocumentException; -import scala.Tuple2; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; +import scala.Tuple2; public class SparkCreateMergeRels extends AbstractSparkAction { @@ -45,9 +44,11 @@ public class SparkCreateMergeRels extends AbstractSparkAction { } public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); parser.parseArgument(args); final String isLookUpUrl = parser.get("isLookUpUrl"); @@ -57,11 +58,13 @@ public class SparkCreateMergeRels extends AbstractSparkAction { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - 
new SparkCreateMergeRels(parser, getSparkSession(conf)).run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); + new SparkCreateMergeRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); } @Override - public void run(ISLookUpService isLookUpService) throws ISLookUpException, DocumentException, IOException { + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { final String graphBasePath = parser.get("graphBasePath"); final String workingPath = parser.get("workingPath"); @@ -75,7 +78,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - for (DedupConfig dedupConf: getConfigurations(isLookUpService, actionSetId)) { + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { final String subEntity = dedupConf.getWf().getSubEntityValue(); log.info("Creating mergerels for: '{}'", subEntity); @@ -83,46 +86,59 @@ public class SparkCreateMergeRels extends AbstractSparkAction { final int maxIterations = dedupConf.getWf().getMaxIterations(); log.info("Max iterations {}", maxIterations); - final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String mergeRelPath = + DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - final JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) s -> new Tuple2<>(hash(s), s)); + final JavaPairRDD vertexes = + sc.textFile(graphBasePath + "/" + subEntity) + .map( + s -> + MapDocumentUtil.getJPathString( + dedupConf.getWf().getIdPath(), s)) + .mapToPair( + (PairFunction) + s -> new Tuple2<>(hash(s), s)); - final RDD> edgeRdd = spark - .read() - .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) - 
.as(Encoders.bean(Relation.class)) - .javaRDD() - .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) - .rdd(); + final RDD> edgeRdd = + spark.read() + .load( + DedupUtility.createSimRelPath( + workingPath, actionSetId, subEntity)) + .as(Encoders.bean(Relation.class)) + .javaRDD() + .map( + it -> + new Edge<>( + hash(it.getSource()), + hash(it.getTarget()), + it.getRelClass())) + .rdd(); - final Dataset mergeRels = spark - .createDataset(GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, maxIterations) - .toJavaRDD() - .filter(k -> k.getDocIds().size() > 1) - .flatMap(cc -> ccToMergeRel(cc, dedupConf)) - .rdd(), Encoders.bean(Relation.class)); + final Dataset mergeRels = + spark.createDataset( + GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, maxIterations) + .toJavaRDD() + .filter(k -> k.getDocIds().size() > 1) + .flatMap(cc -> ccToMergeRel(cc, dedupConf)) + .rdd(), + Encoders.bean(Relation.class)); - mergeRels - .write() - .mode(SaveMode.Append) - .parquet(mergeRelPath); + mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); } - } - public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf){ - return cc.getDocIds() - .stream() - .flatMap(id -> { - List tmp = new ArrayList<>(); + public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { + return cc.getDocIds().stream() + .flatMap( + id -> { + List tmp = new ArrayList<>(); - tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); - tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); + tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); + tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); - return tmp.stream(); - }).iterator(); + return tmp.stream(); + }) + .iterator(); } private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) { @@ -144,8 +160,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction { provenanceAction.setSchemename(DNET_PROVENANCE_ACTIONS); 
info.setProvenanceaction(provenanceAction); - //TODO calculate the trust value based on the similarity score of the elements in the CC - //info.setTrust(); + // TODO calculate the trust value based on the similarity score of the elements in the CC + // info.setTrust(); r.setDataInfo(info); return r; @@ -154,5 +170,4 @@ public class SparkCreateMergeRels extends AbstractSparkAction { public static long hash(final String id) { return Hashing.murmur3_128().hashString(id).asLong(); } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index d02aef64c..2fd807ccb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.oa.dedup; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.dedup.model.Block; -import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; @@ -13,14 +12,13 @@ import eu.dnetlib.pace.model.FieldListImpl; import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; @@ -29,8 +27,6 @@ 
import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.io.IOException; - public class SparkCreateSimRels extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); @@ -40,28 +36,29 @@ public class SparkCreateSimRels extends AbstractSparkAction { } public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); parser.parseArgument(args); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(new Class[] { - MapDocument.class, - FieldListImpl.class, - FieldValueImpl.class, - Block.class - }); + conf.registerKryoClasses( + new Class[] { + MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class + }); new SparkCreateSimRels(parser, getSparkSession(conf)) .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); } @Override - public void run(ISLookUpService isLookUpService) throws DocumentException, IOException, ISLookUpException { + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException { - //read oozie parameters + // read oozie parameters final String graphBasePath = parser.get("graphBasePath"); final String isLookUpUrl = parser.get("isLookUpUrl"); final String actionSetId = parser.get("actionSetId"); @@ -72,32 +69,39 @@ public class SparkCreateSimRels extends AbstractSparkAction { log.info("actionSetId: '{}'", actionSetId); log.info("workingPath: '{}'", workingPath); - //for each dedup configuration - for (DedupConfig 
dedupConf: getConfigurations(isLookUpService, actionSetId)) { + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { final String entity = dedupConf.getWf().getEntityType(); final String subEntity = dedupConf.getWf().getSubEntityValue(); log.info("Creating simrels for: '{}'", subEntity); - final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); + final String outputPath = + DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); removeOutputDir(spark, outputPath); JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .mapToPair((PairFunction) s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocuments = + sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair( + (PairFunction) + s -> { + MapDocument d = + MapDocumentUtil.asMapDocumentWithJPath( + dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - //create blocks for deduplication + // create blocks for deduplication JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); - //create relations by comparing only elements in the same group - JavaRDD relations = Deduper.computeRelations(sc, blocks, dedupConf) - .map(t -> createSimRel(t._1(), t._2(), entity)); + // create relations by comparing only elements in the same group + JavaRDD relations = + Deduper.computeRelations(sc, blocks, dedupConf) + .map(t -> createSimRel(t._1(), t._2(), entity)); - //save the simrel in the workingdir + // save the simrel in the workingdir spark.createDataset(relations.rdd(), Encoders.bean(Relation.class)) .write() .mode(SaveMode.Append) @@ -113,7 +117,7 @@ public class SparkCreateSimRels extends AbstractSparkAction { r.setRelClass("isSimilarTo"); 
r.setDataInfo(new DataInfo()); - switch(entity) { + switch (entity) { case "result": r.setRelType("resultResult"); break; @@ -125,5 +129,4 @@ public class SparkCreateSimRels extends AbstractSparkAction { } return r; } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index 450cbac5e..17c99c002 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -1,7 +1,7 @@ package eu.dnetlib.dhp.oa.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; +import static org.apache.spark.sql.functions.col; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -9,8 +9,6 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; @@ -18,10 +16,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.io.IOException; - -import static org.apache.spark.sql.functions.col; - public class SparkPropagateRelation extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); @@ -31,14 +25,17 @@ public class SparkPropagateRelation extends AbstractSparkAction { TARGET } - public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) throws Exception { + public 
SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) + throws Exception { super(parser, spark); } public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); parser.parseArgument(args); @@ -64,49 +61,63 @@ public class SparkPropagateRelation extends AbstractSparkAction { final String outputRelationPath = DedupUtility.createEntityPath(dedupGraphPath, "relation"); removeOutputDir(spark, outputRelationPath); - Dataset mergeRels = spark.read() - .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) - .as(Encoders.bean(Relation.class)); + Dataset mergeRels = + spark.read() + .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) + .as(Encoders.bean(Relation.class)); - Dataset> mergedIds = mergeRels - .where(col("relClass").equalTo("merges")) - .select(col("source"), col("target")) - .distinct() - .map((MapFunction>) - r -> new Tuple2<>(r.getString(1), r.getString(0)), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .cache(); + Dataset> mergedIds = + mergeRels + .where(col("relClass").equalTo("merges")) + .select(col("source"), col("target")) + .distinct() + .map( + (MapFunction>) + r -> new Tuple2<>(r.getString(1), r.getString(0)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .cache(); final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); - Dataset rels = spark.read() - .textFile(relationPath) - .map(patchRelFn(), Encoders.bean(Relation.class)); + Dataset rels = + spark.read() + .textFile(relationPath) + .map(patchRelFn(), Encoders.bean(Relation.class)); - //change raw ids with 
dedup ids Dataset newRels = processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getFixRelFn(FieldType.SOURCE)), - mergedIds, FieldType.TARGET, getFixRelFn(FieldType.TARGET)) - .filter(SparkPropagateRelation::containsDedup); + processDataset( + rels, + mergedIds, + FieldType.SOURCE, + getFixRelFn(FieldType.SOURCE)), + mergedIds, + FieldType.TARGET, + getFixRelFn(FieldType.TARGET)) + .filter(SparkPropagateRelation::containsDedup); - //update deletedbyinference - Dataset updated = processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), - mergedIds, FieldType.TARGET, getDeletedFn()); + Dataset updated = + processDataset( + processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), + mergedIds, + FieldType.TARGET, + getDeletedFn()); save(newRels.union(updated), outputRelationPath, SaveMode.Overwrite); - } - private static Dataset processDataset(Dataset rels, Dataset> mergedIds, FieldType type, - MapFunction, Tuple2>, Relation> mapFn) { - final Dataset> mapped = rels - .map((MapFunction>) + private static Dataset processDataset( + Dataset rels, + Dataset> mergedIds, + FieldType type, + MapFunction, Tuple2>, Relation> mapFn) { + final Dataset> mapped = + rels.map( + (MapFunction>) r -> new Tuple2<>(getId(r, type), r), Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - return mapped - .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") + return mapped.joinWith( + mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") .map(mapFn, Encoders.bean(Relation.class)); } @@ -131,7 +142,8 @@ public class SparkPropagateRelation extends AbstractSparkAction { } } - private static MapFunction, Tuple2>, Relation> getFixRelFn(FieldType type) { + private static MapFunction, Tuple2>, Relation> + getFixRelFn(FieldType type) { return value -> { if (value._2() != null) { Relation r = value._1()._2(); @@ -155,7 +167,8 @@ public class SparkPropagateRelation extends 
AbstractSparkAction { }; } - private static MapFunction, Tuple2>, Relation> getDeletedFn() { + private static MapFunction, Tuple2>, Relation> + getDeletedFn() { return value -> { if (value._2() != null) { Relation r = value._1()._2(); @@ -170,7 +183,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { } private static boolean containsDedup(final Relation r) { - return r.getSource().toLowerCase().contains("dedup") || r.getTarget().toLowerCase().contains("dedup"); + return r.getSource().toLowerCase().contains("dedup") + || r.getTarget().toLowerCase().contains("dedup"); } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java index 98ee37e14..99c3b73f3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java @@ -1,15 +1,12 @@ package eu.dnetlib.dhp.oa.dedup; import eu.dnetlib.pace.util.Reporter; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.spark.util.LongAccumulator; -import scala.Serializable; -import scala.Tuple2; - import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.spark.util.LongAccumulator; +import scala.Serializable; +import scala.Tuple2; public class SparkReporter implements Serializable, Reporter { @@ -17,17 +14,20 @@ public class SparkReporter implements Serializable, Reporter { private Map accumulators; - public SparkReporter(Map accumulators){ + public SparkReporter(Map accumulators) { this.accumulators = accumulators; } - public void incrementCounter(String counterGroup, String counterName, long delta, Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map 
accumulators) { final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)){ + if (accumulators.containsKey(accumulatorName)) { accumulators.get(accumulatorName).add(delta); } - } @Override diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java index ea0a06bbe..fde5dd75f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.dedup; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Oaf; @@ -9,10 +8,8 @@ import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -32,8 +29,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.io.IOException; - public class SparkUpdateEntity extends AbstractSparkAction { private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class); @@ -45,10 +40,11 @@ public class SparkUpdateEntity extends AbstractSparkAction { } public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new 
ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkUpdateEntity.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); parser.parseArgument(args); SparkConf conf = new SparkConf(); @@ -71,43 +67,62 @@ public class SparkUpdateEntity extends AbstractSparkAction { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - //for each entity - ModelSupport.entityTypes.forEach((entity, clazz) -> { - final String outputPath = dedupGraphPath + "/" + entity; - removeOutputDir(spark, outputPath); + // for each entity + ModelSupport.entityTypes.forEach( + (entity, clazz) -> { + final String outputPath = dedupGraphPath + "/" + entity; + removeOutputDir(spark, outputPath); - JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString())); + JavaRDD sourceEntity = + sc.textFile( + DedupUtility.createEntityPath( + graphBasePath, entity.toString())); - if (mergeRelExists(workingPath, entity.toString())) { + if (mergeRelExists(workingPath, entity.toString())) { - final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, "*", entity.toString()); - final String dedupRecordPath = DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString()); + final String mergeRelPath = + DedupUtility.createMergeRelPath( + workingPath, "*", entity.toString()); + final String dedupRecordPath = + DedupUtility.createDedupRecordPath( + workingPath, "*", entity.toString()); - final Dataset rel = spark.read() - .load(mergeRelPath) - .as(Encoders.bean(Relation.class)); + final Dataset rel = + spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = rel - .where("relClass == 'merges'") - .select(rel.col("target")) - .distinct() - .toJavaRDD() - 
.mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final JavaPairRDD mergedIds = + rel.where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) + r -> new Tuple2<>(r.getString(0), "d")); - JavaPairRDD entitiesWithId = sourceEntity - .mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + JavaPairRDD entitiesWithId = + sourceEntity.mapToPair( + (PairFunction) + s -> + new Tuple2<>( + MapDocumentUtil.getJPathString( + IDJSONPATH, s), + s)); - JavaRDD map = entitiesWithId - .leftOuterJoin(mergedIds) - .map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), clazz) : k._2()._1()); + JavaRDD map = + entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> + k._2()._2().isPresent() + ? updateDeletedByInference( + k._2()._1(), clazz) + : k._2()._1()); - sourceEntity = map.union(sc.textFile(dedupRecordPath)); - } - - sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); - - }); + sourceEntity = map.union(sc.textFile(dedupRecordPath)); + } + sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); + }); } public boolean mergeRelExists(String basePath, String entity) { @@ -120,7 +135,10 @@ public class SparkUpdateEntity extends AbstractSparkAction { for (FileStatus fs : fileStatuses) { if (fs.isDirectory()) - if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + if (fileSystem.exists( + new Path( + DedupUtility.createMergeRelPath( + basePath, fs.getPath().getName(), entity)))) result = true; } @@ -130,16 +148,15 @@ public class SparkUpdateEntity extends AbstractSparkAction { } } - private static String updateDeletedByInference(final String json, final Class clazz) { + private static String updateDeletedByInference( + final String json, final Class clazz) { try { Oaf entity = OBJECT_MAPPER.readValue(json, clazz); - if (entity.getDataInfo()== null) - entity.setDataInfo(new 
DataInfo()); + if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); entity.getDataInfo().setDeletedbyinference(true); return OBJECT_MAPPER.writeValueAsString(entity); } catch (IOException e) { throw new RuntimeException("Unable to convert json", e); } } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index 7bfa5dc3d..4baac0229 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -3,21 +3,18 @@ package eu.dnetlib.dhp.oa.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.annotate.JsonIgnore; - import java.io.IOException; import java.io.Serializable; import java.util.Set; +import org.apache.commons.lang.StringUtils; +import org.codehaus.jackson.annotate.JsonIgnore; public class ConnectedComponent implements Serializable { private Set docIds; private String ccId; - - public ConnectedComponent() { - } + public ConnectedComponent() {} public ConnectedComponent(Set docIds) { this.docIds = docIds; @@ -28,7 +25,7 @@ public class ConnectedComponent implements Serializable { if (docIds.size() > 1) { final String s = getMin(); String prefix = s.split("\\|")[0]; - ccId =prefix + "|dedup_______::" + DedupUtility.md5(s); + ccId = prefix + "|dedup_______::" + DedupUtility.md5(s); return ccId; } else { return docIds.iterator().next(); @@ -36,24 +33,25 @@ public class ConnectedComponent implements Serializable { } @JsonIgnore - public String getMin(){ + public String getMin() { final StringBuilder min = new StringBuilder(); - docIds.forEach(i -> { - if 
(StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); + docIds.forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); return min.toString(); } @Override - public String toString(){ + public String toString() { ObjectMapper mapper = new ObjectMapper(); try { return mapper.writeValueAsString(this); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java index e1ccf143c..8f3d3ffa4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.oa.dedup.model; import com.google.common.collect.Lists; import eu.dnetlib.pace.model.MapDocument; - import java.io.Serializable; import java.util.ArrayList; import java.util.Comparator; @@ -36,11 +35,13 @@ public class Block implements Serializable { Iterable it = () -> blocks; block.setDocuments( - StreamSupport.stream(it.spliterator(), false) - .flatMap(b -> b.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); + StreamSupport.stream(it.spliterator(), false) + .flatMap(b -> b.getDocuments().stream()) + .sorted( + Comparator.comparing( + a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); return block; } @@ -48,12 +49,12 @@ public class Block implements Serializable { Block block = new Block(); block.setKey(b1.getKey()); block.setDocuments( - Stream.concat( - b1.getDocuments().stream(), - 
b2.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); + Stream.concat(b1.getDocuments().stream(), b2.getDocuments().stream()) + .sorted( + Comparator.comparing( + a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); return block; } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java index 970fc0ddb..29e010cbd 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java @@ -1,15 +1,13 @@ package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.oa.dedup.DedupUtility; import eu.dnetlib.dhp.schema.oaf.Publication; -import org.apache.commons.io.IOUtils; -import org.codehaus.jackson.map.ObjectMapper; -import org.junit.jupiter.api.BeforeEach; - import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.codehaus.jackson.map.ObjectMapper; +import org.junit.jupiter.api.BeforeEach; public class MergeAuthorTest { @@ -18,28 +16,36 @@ public class MergeAuthorTest { @BeforeEach public void setUp() throws Exception { - final String json = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); + final String json = + IOUtils.toString( + this.getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/json/authors_merge.json")); - publicationsToMerge = Arrays.asList(json.split("\n")).stream().map(s-> { - try { - return mapper.readValue(s, Publication.class); - } catch (IOException e) { - throw new RuntimeException(e); - } - 
}).collect(Collectors.toList()); + publicationsToMerge = + Arrays.asList(json.split("\n")).stream() + .map( + s -> { + try { + return mapper.readValue(s, Publication.class); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); } - //FIX ME Michele DB this tests doesn't work - //@Test - public void test() throws Exception { + // FIX ME Michele DB this tests doesn't work + // @Test + public void test() throws Exception { Publication dedup = new Publication(); - publicationsToMerge.forEach(p-> { - dedup.mergeFrom(p); - dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(),p.getAuthor())); - }); + publicationsToMerge.forEach( + p -> { + dedup.mergeFrom(p); + dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); + }); System.out.println(mapper.writeValueAsString(dedup)); } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 01cbf0489..7ad2f6d17 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -1,37 +1,29 @@ package eu.dnetlib.dhp.oa.dedup; +import static java.nio.file.Files.createTempDirectory; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.util.MapDocumentUtil; +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.file.Paths; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; 
import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.*; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import scala.Tuple2; - -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.net.URISyntaxException; -import java.nio.file.Paths; - -import static java.nio.file.Files.createTempDirectory; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.lenient; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @@ -46,164 +38,224 @@ public class SparkDedupTest implements Serializable { private static String testGraphBasePath; private static String testOutputBasePath; private static String testDedupGraphBasePath; - private final static String testActionSetId = "test-orchestrator"; + private static final String testActionSetId = "test-orchestrator"; @BeforeAll private static void cleanUp() throws IOException, URISyntaxException { - testGraphBasePath = Paths.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()).toFile().getAbsolutePath(); + testGraphBasePath = + Paths.get( + SparkDedupTest.class + .getResource("/eu/dnetlib/dhp/dedup/entities") + .toURI()) + .toFile() + .getAbsolutePath(); - testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); - testDedupGraphBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); + testOutputBasePath = + 
createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + testDedupGraphBasePath = + createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); FileUtils.deleteDirectory(new File(testOutputBasePath)); FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - spark = SparkSession - .builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master("local[*]") - .config(new SparkConf()) - .getOrCreate(); + spark = + SparkSession.builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master("local[*]") + .config(new SparkConf()) + .getOrCreate(); jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } @BeforeEach private void setUp() throws IOException, ISLookUpException { - lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) - .thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn( + IOUtils.toString( + SparkDedupTest.class.getResourceAsStream( + "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); - lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) - .thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - - lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) - .thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); - - lenient().when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) - .thenReturn(IOUtils.toString(SparkDedupTest.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); + lenient() + 
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) + .thenReturn( + IOUtils.toString( + SparkDedupTest.class.getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn( + IOUtils.toString( + SparkDedupTest.class.getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); } @Test @Order(1) public void createSimRelsTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); - parser.parseArgument(new String[]{ - "-i", testGraphBasePath, - "-asi", testActionSetId, - "-la", "lookupurl", - "-w", testOutputBasePath}); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser.parseArgument( + new String[] { + "-mt", "local[*]", + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath + }); new SparkCreateSimRels(parser, spark).run(isLookUpService); - long orgs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel").count(); - long pubs_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel").count(); - long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); + long orgs_simrel = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") + .count(); + long pubs_simrel = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") + .count(); assertEquals(3288, orgs_simrel); assertEquals(7260, pubs_simrel); - assertEquals(344, sw_simrel); } @Test @Order(2) 
public void createMergeRelsTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateMergeRels.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); - parser.parseArgument(new String[]{ - "-i", testGraphBasePath, - "-asi", testActionSetId, - "-la", "lookupurl", - "-w", testOutputBasePath}); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateMergeRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser.parseArgument( + new String[] { + "-mt", "local[*]", + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath + }); new SparkCreateMergeRels(parser, spark).run(isLookUpService); - long orgs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").count(); - long pubs_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel").count(); - long sw_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel").count(); + long orgs_mergerel = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .count(); + long pubs_mergerel = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .count(); assertEquals(1244, orgs_mergerel); assertEquals(1460, pubs_mergerel); - assertEquals(288, sw_mergerel); } @Test @Order(3) public void createDedupRecordTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); - parser.parseArgument(new String[]{ - "-i", testGraphBasePath, - "-asi", testActionSetId, - "-la", "lookupurl", - "-w", testOutputBasePath}); + ArgumentApplicationParser parser = + new 
ArgumentApplicationParser( + IOUtils.toString( + SparkCreateDedupRecord.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + parser.parseArgument( + new String[] { + "-mt", "local[*]", + "-i", testGraphBasePath, + "-asi", testActionSetId, + "-la", "lookupurl", + "-w", testOutputBasePath + }); new SparkCreateDedupRecord(parser, spark).run(isLookUpService); - long orgs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord").count(); - long pubs_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord").count(); - long sw_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord").count(); + long orgs_deduprecord = + jsc.textFile( + testOutputBasePath + + "/" + + testActionSetId + + "/organization_deduprecord") + .count(); + long pubs_deduprecord = + jsc.textFile( + testOutputBasePath + + "/" + + testActionSetId + + "/publication_deduprecord") + .count(); assertEquals(82, orgs_deduprecord); assertEquals(66, pubs_deduprecord); - assertEquals(51, sw_deduprecord); } @Test @Order(4) public void updateEntityTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); - parser.parseArgument(new String[]{ - "-i", testGraphBasePath, - "-w", testOutputBasePath, - "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkUpdateEntity.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + parser.parseArgument( + new String[] { + "-mt", "local[*]", + "-i", testGraphBasePath, + "-w", testOutputBasePath, + "-o", testDedupGraphBasePath + }); new SparkUpdateEntity(parser, spark).run(isLookUpService); long organizations = jsc.textFile(testDedupGraphBasePath 
+ "/organization").count(); long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); - long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); - long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); - long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); - long mergedOrgs = spark - .read().load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel").as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct().count(); + long mergedOrgs = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedPubs = spark - .read().load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel").as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct().count(); + long mergedPubs = + spark.read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); assertEquals(897, publications); assertEquals(835, organizations); - assertEquals(100, projects); - assertEquals(100, datasource); - assertEquals(200, softwares); - long deletedOrgs = jsc.textFile(testDedupGraphBasePath + "/organization") - .filter(this::isDeletedByInference).count(); - long deletedPubs = jsc.textFile(testDedupGraphBasePath + "/publication") - .filter(this::isDeletedByInference).count(); + long deletedOrgs = + jsc.textFile(testDedupGraphBasePath + "/organization") + .filter(this::isDeletedByInference) + .count(); + long deletedPubs = + jsc.textFile(testDedupGraphBasePath + "/publication") + .filter(this::isDeletedByInference) + .count(); 
assertEquals(mergedOrgs, deletedOrgs); assertEquals(mergedPubs, deletedPubs); @@ -213,42 +265,24 @@ public class SparkDedupTest implements Serializable { @Order(5) public void propagateRelationTest() throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - parser.parseArgument(new String[]{ - "-i", testGraphBasePath, - "-w", testOutputBasePath, - "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkPropagateRelation.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + parser.parseArgument( + new String[] { + "-mt", "local[*]", + "-i", testGraphBasePath, + "-w", testOutputBasePath, + "-o", testDedupGraphBasePath + }); new SparkPropagateRelation(parser, spark).run(isLookUpService); long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); assertEquals(826, relations); - - //check deletedbyinference - final Dataset mergeRels = spark.read().load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*")).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = mergeRels - .where("relClass == 'merges'") - .select(mergeRels.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2(r.getString(0), "d")); - - JavaRDD toCheck = jsc.textFile(testDedupGraphBasePath + "/relation") - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.source", json), json)) - .join(mergedIds) - .map(t -> t._2()._1()) - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.target", json), json)) - .join(mergedIds) - .map(t -> t._2()._1()); - - long deletedbyinference = toCheck.filter(this::isDeletedByInference).count(); - long updated = toCheck.count(); - - assertEquals(updated, deletedbyinference); } @AfterAll @@ -258,6 
+292,6 @@ public class SparkDedupTest implements Serializable { } public boolean isDeletedByInference(String s) { - return s.contains("\"deletedbyinference\":true"); + return s.contains("\"deletedbyinference\":true"); } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java index d72734ed5..b8b83b758 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java @@ -7,283 +7,285 @@ import org.junit.jupiter.api.Test; public class JsonPathTest { - String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"
value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"fa
lse\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisi
ble\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; - DedupConfig conf = DedupConfig.load("{\n" + - " \"wf\" : {\n" + - " \"threshold\" : \"0.99\",\n" + - " \"dedupRun\" : \"001\",\n" + - " \"entityType\" : \"organization\",\n" + - " \"subEntityValue\": \"organization\",\n" + - " \"orderField\" : \"legalname\",\n" + - " \"queueMaxSize\" : \"2000\",\n" + - " \"groupMaxSize\" : \"50\",\n" + - " \"slidingWindowSize\" : \"200\",\n" + - " \"idPath\":\"$.id\",\n" + - " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + - " \"includeChildren\" : \"true\",\n" + - " \"maxIterations\": \"20\"\n" + - " },\n" + - " \"pace\" : {\n" + - " \"clustering\" : [\n" + - " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + - " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + - " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + - " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + - " ],\n" + - " \"decisionTree\" : {\n" + - " \"start\": {\n" + - " \"fields\": [\n" + - " {\n" + - " \"field\": \"gridid\",\n" + - " \"comparator\": \"exactMatch\",\n" + - " \"weight\": 1,\n" + - " \"countIfUndefined\": \"false\",\n" + - " \"params\": {}\n" + - " }\n" + - " ],\n" + - " \"threshold\": 1,\n" + - " \"aggregation\": \"AVG\",\n" + - " \"positive\": \"MATCH\",\n" + - " 
\"negative\": \"NO_MATCH\",\n" + - " \"undefined\": \"layer2\",\n" + - " \"ignoreUndefined\": \"false\"\n" + - " },\n" + - " \"layer2\": {\n" + - " \"fields\": [\n" + - " {\n" + - " \"field\": \"websiteurl\",\n" + - " \"comparator\": \"domainExactMatch\",\n" + - " \"weight\": 1,\n" + - " \"countIfUndefined\": \"false\",\n" + - " \"params\": {}\n" + - " },\n" + - " {\n" + - " \"field\": \"country\",\n" + - " \"comparator\": \"exactMatch\",\n" + - " \"weight\": 1,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {}\n" + - " },\n" + - " {\n" + - " \"field\": \"legalname\",\n" + - " \"comparator\": \"numbersMatch\",\n" + - " \"weight\": 1,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {}\n" + - " },\n" + - " {\n" + - " \"field\": \"legalname\",\n" + - " \"comparator\": \"romansMatch\",\n" + - " \"weight\": 1,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {}\n" + - " }\n" + - " ],\n" + - " \"threshold\": 1,\n" + - " \"aggregation\": \"AND\",\n" + - " \"positive\": \"layer3\",\n" + - " \"negative\": \"NO_MATCH\",\n" + - " \"undefined\": \"layer3\",\n" + - " \"ignoreUndefined\": \"true\"\n" + - " },\n" + - " \"layer3\": {\n" + - " \"fields\": [\n" + - " {\n" + - " \"field\": \"legalname\",\n" + - " \"comparator\": \"cityMatch\",\n" + - " \"weight\": 1.0,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {\n" + - " \"windowSize\": \"4\"\n" + - " }\n" + - " }\n" + - " ],\n" + - " \"threshold\": 0.1,\n" + - " \"aggregation\": \"AVG\",\n" + - " \"positive\": \"layer4\",\n" + - " \"negative\": \"NO_MATCH\",\n" + - " \"undefined\": \"NO_MATCH\",\n" + - " \"ignoreUndefined\": \"true\"\n" + - " },\n" + - " \"layer4\": {\n" + - " \"fields\": [\n" + - " {\n" + - " \"field\": \"legalname\",\n" + - " \"comparator\": \"keywordMatch\",\n" + - " \"weight\": 1.0,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {\n" + - " \"windowSize\": \"4\"\n" + - " }\n" + - " }\n" + - " ],\n" + - " \"threshold\": 0.7,\n" + - " 
\"aggregation\": \"AVG\",\n" + - " \"positive\": \"layer5\",\n" + - " \"negative\": \"NO_MATCH\",\n" + - " \"undefined\": \"layer5\",\n" + - " \"ignoreUndefined\": \"true\"\n" + - " },\n" + - " \"layer5\": {\n" + - " \"fields\": [\n" + - " {\n" + - " \"field\": \"legalname\",\n" + - " \"comparator\": \"jaroWinklerNormalizedName\",\n" + - " \"weight\": 0.9,\n" + - " \"countIfUndefined\": \"true\",\n" + - " \"params\": {\n" + - " \"windowSize\": \"4\"\n" + - " }\n" + - " },\n" + - " {\n" + - " \"field\": \"legalshortname\",\n" + - " \"comparator\": \"jaroWinklerNormalizedName\",\n" + - " \"weight\": 0.1,\n" + - " \"countIfUndefined\": \"false\",\n" + - " \"params\": {\n" + - " \"windowSize\": 4\n" + - " }\n" + - " }\n" + - " ],\n" + - " \"threshold\": 0.9,\n" + - " \"aggregation\": \"W_MEAN\",\n" + - " \"positive\": \"MATCH\",\n" + - " \"negative\": \"NO_MATCH\",\n" + - " \"undefined\": \"NO_MATCH\",\n" + - " \"ignoreUndefined\": \"true\"\n" + - " }\n" + - " },\n" + - " \"model\" : [\n" + - " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + - " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + - " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + - " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + - " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + - " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + - " ],\n" + - " \"blacklists\" : {\n" + - " \"legalname\" : []\n" + - " },\n" + - " \"synonyms\": {\n" + - " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", 
\"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + - " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + - " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + - " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + - " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + - " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + - " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + - " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + - " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + - " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + - " \"key::11\": 
[\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + - " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + - " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + - " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + - " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + - " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + - " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + - " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + - " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + - " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + - " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + - " \"key::22\": 
[\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + - " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + - " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + - " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + - " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + - " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + - " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + - " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + - " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + - " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + - " \"key::32\": 
[\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + - " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + - " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + - " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + - " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + - " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + - " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + - " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + - " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + - " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + - " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + - " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + - " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + - " \"key::45\": 
[\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + - " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + - " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + - " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + - " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + - " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + - " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + - " \"key::52\": 
[\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + - " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + - " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + - " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + - " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + - " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + - " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + - " \"key::59\": 
[\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + - " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + - " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + - " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + - " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + - " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + - " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + - " \"key::66\": 
[\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + - " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + - " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + - " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + - " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + - " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + - " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + - " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + - " \"key::74\": 
[\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + - " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + - " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + - " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + - " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + - " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + - " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + - " \"key::81\": 
[\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + - " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + - " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + - " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + - " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + - " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + - " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + - " \"key::88\": 
[\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + - " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + - " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + - " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + - " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + - " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + - " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + - " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + - " \"key::96\": 
[\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + - " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + - " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + - " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + - " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + - " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + - " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + - " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + - " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + - " \"key::105\" : [\"state\", 
\"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + - " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + - " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + - " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + - " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + - " }\n" + - " }\n" + - "}"); + String json = + "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value
\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"
value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = + DedupConfig.load( + "{\n" + + " 
\"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " 
\"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " },\n" 
+ + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": 
[\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": 
[\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": 
[\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": 
[\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": 
[\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": 
[\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": 
[\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": 
[\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": 
[\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": 
[\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": 
[\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": 
[\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " 
\"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); @Test - public void testJPath () throws Exception { + public void testJPath() throws Exception { MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); System.out.println("d = " + d); - } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java index 73f178edc..5c4111c20 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java @@ -1,18 +1,17 @@ package eu.dnetlib.dedup; -import eu.dnetlib.dhp.schema.oaf.Field; -import org.apache.commons.lang.StringUtils; - -import java.time.Year; -import java.util.*; -import java.util.stream.Collectors; - import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; +import eu.dnetlib.dhp.schema.oaf.Field; +import java.time.Year; +import java.util.*; +import java.util.stream.Collectors; +import org.apache.commons.lang.StringUtils; + public class DatePicker { private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; @@ -22,33 +21,32 @@ public class DatePicker { public static Field pick(final Collection dateofacceptance) { - final Map frequencies = dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect( - Collectors.toConcurrentMap( - w -> w, w -> 1, Integer::sum)); + final Map frequencies = + dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> 
w, w -> 1, Integer::sum)); if (frequencies.isEmpty()) { return new Field<>(); } final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + date.setValue(frequencies.keySet().iterator().next()); // let's sort this map by values first, filtering out invalid dates - final Map sorted = frequencies - .entrySet() - .stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap( - Map.Entry::getKey, - Map.Entry::getValue, (e1, e2) -> e2, - LinkedHashMap::new)); + final Map sorted = + frequencies.entrySet().stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap( + Map.Entry::getKey, + Map.Entry::getValue, + (e1, e2) -> e2, + LinkedHashMap::new)); // shortcut if (sorted.size() == 0) { @@ -58,18 +56,24 @@ public class DatePicker { // voting method (1/3 + 1) wins if (sorted.size() >= 3) { final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + final List accepted = + sorted.entrySet().stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); // cannot find strong majority if (accepted.isEmpty()) { final int max = sorted.values().iterator().next(); - Optional first = sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); + Optional first = + sorted.entrySet().stream() + .filter( + e -> + e.getValue() == max + && !endsWith( + e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); if (first.isPresent()) { 
date.setValue(first.get()); return date; @@ -83,9 +87,10 @@ public class DatePicker { date.setValue(accepted.get(0)); return date; } else { - final Optional first = accepted.stream() - .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) - .findFirst(); + final Optional first = + accepted.stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); if (first.isPresent()) { date.setValue(first.get()); return date; @@ -94,7 +99,7 @@ public class DatePicker { return date; } - //1st non YYYY-01-01 is returned + // 1st non YYYY-01-01 is returned } else { if (sorted.size() == 2) { for (Map.Entry e : sorted.entrySet()) { @@ -115,5 +120,4 @@ public class DatePicker { final int year = Integer.parseInt(substringBefore(date, "-")); return year >= YEAR_LB && year <= YEAR_UB; } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java index ebb504078..f8b69cc14 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -1,43 +1,64 @@ package eu.dnetlib.dedup; import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.util.Collection; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; import scala.Tuple2; -import java.util.Collection; - public 
class DedupRecordFactory { - public static JavaRDD createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) { + public static JavaRDD createDedupRecord( + final JavaSparkContext sc, + final SparkSession spark, + final String mergeRelsInputPath, + final String entitiesInputPath, + final OafEntityType entityType, + final DedupConfig dedupConf) { long ts = System.currentTimeMillis(); - // - final JavaPairRDD inputJsonEntities = sc.textFile(entitiesInputPath) - .mapToPair((PairFunction) it -> - new Tuple2(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it) - ); + // + final JavaPairRDD inputJsonEntities = + sc.textFile(entitiesInputPath) + .mapToPair( + (PairFunction) + it -> + new Tuple2( + MapDocumentUtil.getJPathString( + dedupConf.getWf().getIdPath(), it), + it)); - //: source is the dedup_id, target is the id of the mergedIn - JavaPairRDD mergeRels = spark - .read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .mapToPair( - (PairFunction) r -> - new Tuple2(r.getTarget(), r.getSource()) - ); + // : source is the dedup_id, target is the id of the mergedIn + JavaPairRDD mergeRels = + spark.read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .mapToPair( + (PairFunction) + r -> + new Tuple2( + r.getTarget(), r.getSource())); - // - final JavaPairRDD joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction>, String, String>) Tuple2::_2); + // + final JavaPairRDD joinResult = + mergeRels + .join(inputJsonEntities) + .mapToPair( + (PairFunction< + Tuple2>, + String, + String>) + Tuple2::_2); JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); @@ -55,42 +76,46 @@ public class DedupRecordFactory { case organization: return sortedJoinResult.map(o -> 
DedupRecordFactory.organizationMerger(o, ts)); case otherresearchproduct: - return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); + return sortedJoinResult.map( + o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); default: return null; } - } - private static Publication publicationMerger(Tuple2> e, final long ts) { + private static Publication publicationMerger( + Tuple2> e, final long ts) { - Publication p = new Publication(); //the result of the merge, to be returned at the end + Publication p = new Publication(); // the result of the merge, to be returned at the end p.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); if (e._2() != null) - e._2().forEach(pub -> { - try { - Publication publication = mapper.readValue(pub, Publication.class); + e._2().forEach( + pub -> { + try { + Publication publication = + mapper.readValue(pub, Publication.class); - p.mergeFrom(publication); - p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); - //add to the list if they are not null - if (publication.getDateofacceptance() != null) - dateofacceptance.add(publication.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); + p.mergeFrom(publication); + p.setAuthor( + DedupUtility.mergeAuthor( + p.getAuthor(), publication.getAuthor())); + // add to the list if they are not null + if (publication.getDateofacceptance() != null) + dateofacceptance.add( + publication.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); p.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (p.getDataInfo() == null) - p.setDataInfo(new DataInfo()); + if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); p.getDataInfo().setTrust("0.9"); p.setLastupdatetimestamp(ts); 
return p; @@ -98,7 +123,7 @@ public class DedupRecordFactory { private static Dataset datasetMerger(Tuple2> e, final long ts) { - Dataset d = new Dataset(); //the result of the merge, to be returned at the end + Dataset d = new Dataset(); // the result of the merge, to be returned at the end d.setId(e._1()); @@ -108,22 +133,25 @@ public class DedupRecordFactory { final Collection dateofacceptance = Lists.newArrayList(); if (e._2() != null) - e._2().forEach(dat -> { - try { - Dataset dataset = mapper.readValue(dat, Dataset.class); + e._2().forEach( + dat -> { + try { + Dataset dataset = mapper.readValue(dat, Dataset.class); - d.mergeFrom(dataset); - d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); - //add to the list if they are not null - if (dataset.getDateofacceptance() != null) - dateofacceptance.add(dataset.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); + d.mergeFrom(dataset); + d.setAuthor( + DedupUtility.mergeAuthor( + d.getAuthor(), dataset.getAuthor())); + // add to the list if they are not null + if (dataset.getDateofacceptance() != null) + dateofacceptance.add( + dataset.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); d.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (d.getDataInfo() == null) - d.setDataInfo(new DataInfo()); + if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); d.getDataInfo().setTrust("0.9"); d.setLastupdatetimestamp(ts); return d; @@ -131,24 +159,24 @@ public class DedupRecordFactory { private static Project projectMerger(Tuple2> e, final long ts) { - Project p = new Project(); //the result of the merge, to be returned at the end + Project p = new Project(); // the result of the merge, to be returned at the end p.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); if (e._2() != null) - 
e._2().forEach(proj -> { - try { - Project project = mapper.readValue(proj, Project.class); + e._2().forEach( + proj -> { + try { + Project project = mapper.readValue(proj, Project.class); - p.mergeFrom(project); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (p.getDataInfo() == null) - p.setDataInfo(new DataInfo()); + p.mergeFrom(project); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); p.getDataInfo().setTrust("0.9"); p.setLastupdatetimestamp(ts); return p; @@ -156,100 +184,106 @@ public class DedupRecordFactory { private static Software softwareMerger(Tuple2> e, final long ts) { - Software s = new Software(); //the result of the merge, to be returned at the end + Software s = new Software(); // the result of the merge, to be returned at the end s.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); final Collection dateofacceptance = Lists.newArrayList(); if (e._2() != null) - e._2().forEach(soft -> { - try { - Software software = mapper.readValue(soft, Software.class); + e._2().forEach( + soft -> { + try { + Software software = mapper.readValue(soft, Software.class); - s.mergeFrom(software); - s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); - //add to the list if they are not null - if (software.getDateofacceptance() != null) - dateofacceptance.add(software.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); + s.mergeFrom(software); + s.setAuthor( + DedupUtility.mergeAuthor( + s.getAuthor(), software.getAuthor())); + // add to the list if they are not null + if (software.getDateofacceptance() != null) + dateofacceptance.add( + software.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); 
s.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (s.getDataInfo() == null) - s.setDataInfo(new DataInfo()); + if (s.getDataInfo() == null) s.setDataInfo(new DataInfo()); s.getDataInfo().setTrust("0.9"); s.setLastupdatetimestamp(ts); return s; } private static Datasource datasourceMerger(Tuple2> e, final long ts) { - Datasource d = new Datasource(); //the result of the merge, to be returned at the end + Datasource d = new Datasource(); // the result of the merge, to be returned at the end d.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); if (e._2() != null) - e._2().forEach(dat -> { - try { - Datasource datasource = mapper.readValue(dat, Datasource.class); + e._2().forEach( + dat -> { + try { + Datasource datasource = mapper.readValue(dat, Datasource.class); - d.mergeFrom(datasource); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (d.getDataInfo() == null) - d.setDataInfo(new DataInfo()); + d.mergeFrom(datasource); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); d.getDataInfo().setTrust("0.9"); d.setLastupdatetimestamp(ts); return d; } - private static Organization organizationMerger(Tuple2> e, final long ts) { + private static Organization organizationMerger( + Tuple2> e, final long ts) { - Organization o = new Organization(); //the result of the merge, to be returned at the end + Organization o = new Organization(); // the result of the merge, to be returned at the end o.setId(e._1()); final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - StringBuilder trust = new StringBuilder("0.0"); if (e._2() != null) - e._2().forEach(pub -> { - try { - Organization organization = mapper.readValue(pub, Organization.class); + e._2().forEach( + pub -> { + try { + Organization organization 
= + mapper.readValue(pub, Organization.class); - final String currentTrust = organization.getDataInfo().getTrust(); - if (!"1.0".equals(currentTrust)) { - trust.setLength(0); - trust.append(currentTrust); - } - o.mergeFrom(organization); + final String currentTrust = + organization.getDataInfo().getTrust(); + if (!"1.0".equals(currentTrust)) { + trust.setLength(0); + trust.append(currentTrust); + } + o.mergeFrom(organization); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); - if (o.getDataInfo() == null) - { + if (o.getDataInfo() == null) { o.setDataInfo(new DataInfo()); } - if (o.getDataInfo() == null) - o.setDataInfo(new DataInfo()); + if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); o.getDataInfo().setTrust("0.9"); o.setLastupdatetimestamp(ts); return o; } - private static OtherResearchProduct otherresearchproductMerger(Tuple2> e, final long ts) { + private static OtherResearchProduct otherresearchproductMerger( + Tuple2> e, final long ts) { - OtherResearchProduct o = new OtherResearchProduct(); //the result of the merge, to be returned at the end + OtherResearchProduct o = + new OtherResearchProduct(); // the result of the merge, to be returned at the end o.setId(e._1()); @@ -259,25 +293,31 @@ public class DedupRecordFactory { final Collection dateofacceptance = Lists.newArrayList(); if (e._2() != null) - e._2().forEach(orp -> { - try { - OtherResearchProduct otherResearchProduct = mapper.readValue(orp, OtherResearchProduct.class); + e._2().forEach( + orp -> { + try { + OtherResearchProduct otherResearchProduct = + mapper.readValue(orp, OtherResearchProduct.class); - o.mergeFrom(otherResearchProduct); - o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); - //add to the list if they are not null - if (otherResearchProduct.getDateofacceptance() != null) - 
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (o.getDataInfo() == null) - o.setDataInfo(new DataInfo()); + o.mergeFrom(otherResearchProduct); + o.setAuthor( + DedupUtility.mergeAuthor( + o.getAuthor(), + otherResearchProduct.getAuthor())); + // add to the list if they are not null + if (otherResearchProduct.getDateofacceptance() != null) + dateofacceptance.add( + otherResearchProduct + .getDateofacceptance() + .getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); o.setDateofacceptance(DatePicker.pick(dateofacceptance)); o.getDataInfo().setTrust("0.9"); o.setLastupdatetimestamp(ts); return o; } - } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java index 7ed102e03..977c14e13 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -8,6 +8,13 @@ import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; +import java.io.IOException; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -21,32 +28,43 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; import scala.Tuple2; -import java.io.IOException; -import java.io.StringWriter; -import 
java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.text.Normalizer; -import java.util.*; -import java.util.stream.Collectors; - public class DedupUtility { private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator(final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { Map accumulators = new HashMap<>(); - String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + String acc1 = + String.format( + "%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + String acc2 = + String.format( + "%s::%s", + dedupConf.getWf().getEntityType(), + "missing " + dedupConf.getWf().getOrderField()); accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + String acc3 = + String.format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String.format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), + dedupConf.getWf().getGroupMaxSize())); accumulators.put(acc3, context.longAccumulator(acc3)); String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + String acc5 = + String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String.format("%s::%s", 
dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + String acc6 = + String.format( + "%s::%s", + dedupConf.getWf().getEntityType(), + "d < " + dedupConf.getWf().getThreshold()); accumulators.put(acc6, context.longAccumulator(acc6)); return accumulators; @@ -71,7 +89,6 @@ public class DedupUtility { FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - } static String readFromClasspath(final String filename, final Class clazz) { @@ -99,7 +116,6 @@ public class DedupUtility { } } - public static List mergeAuthor(final List a, final List b) { int pa = countAuthorsPids(a); int pb = countAuthorsPids(b); @@ -119,29 +135,40 @@ public class DedupUtility { } private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) - return; - final Map basePidAuthorMap = base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid() - .stream() - .map(p -> new Tuple2<>(p.toComparableString(), a)) - ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + if (base == null || enrich == null) return; + final Map basePidAuthorMap = + base.stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> + a.getPid().stream() + .map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = enrich - .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = + enrich.stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> + a.getPid().stream() + .filter( + p -> + !basePidAuthorMap.containsKey( + 
p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - - pidToEnrich.forEach(a -> { - Optional> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); + pidToEnrich.forEach( + a -> { + Optional> simAuhtor = + base.stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); } public static String createEntityPath(final String basePath, final String entityType) { @@ -162,19 +189,21 @@ public class DedupUtility { final Person pb = parse(b); if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler().score( - normalize(pa.getSurnameString()), - normalize(pb.getSurnameString())); + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); } else { - return new JaroWinkler().score( - normalize(pa.getNormalisedFullname()), - normalize(pb.getNormalisedFullname())); + return new JaroWinkler() + .score( + normalize(pa.getNormalisedFullname()), + normalize(pb.getNormalisedFullname())); } } private static String normalize(final String s) { return nfd(s).toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings .replaceAll("(\\W)+", " ") .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") .replaceAll("(\\p{Punct})+", " ") @@ -195,23 +224,19 @@ public class DedupUtility { } } - private static int countAuthorsPids(List authors) { - if (authors == null) - return 0; + if (authors == null) return 0; return (int) 
authors.stream().filter(DedupUtility::hasPid).count(); } private static int authorsSize(List authors) { - if (authors == null) - return 0; + if (authors == null) return 0; return authors.size(); } private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) - return false; + if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java index 7206f892f..bec4229ee 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java @@ -1,10 +1,11 @@ package eu.dnetlib.dedup; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.api.java.JavaPairRDD; @@ -17,9 +18,6 @@ import org.apache.spark.util.LongAccumulator; import scala.Serializable; import scala.Tuple2; -import java.util.*; -import java.util.stream.Collectors; - public class Deduper implements Serializable { private static final Log log = LogFactory.getLog(Deduper.class); @@ -30,26 +28,32 @@ public class Deduper implements Serializable { * @param: list of JSON entities to be deduped * @param: the dedup configuration */ - public static JavaPairRDD dedup(JavaSparkContext context, JavaRDD entities, DedupConfig config) { + public static JavaPairRDD dedup( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - Map accumulators = 
DedupUtility.constructAccumulator(config, context.sc()); + Map accumulators = + DedupUtility.constructAccumulator(config, context.sc()); - //create vertexes of the graph: + // create vertexes of the graph: JavaPairRDD mapDocs = mapToVertexes(context, entities, config); - - //create blocks for deduplication + // create blocks for deduplication JavaPairRDD> blocks = createBlocks(context, mapDocs, config); - //create relations by comparing only elements in the same group + // create relations by comparing only elements in the same group return computeRelations(context, blocks, config); -// final RDD> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo")).rdd(); -// -// RDD> vertexes = mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> new Tuple2((long) t._1().hashCode(), t._2())).rdd(); -// accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); -// -// return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + // final RDD> edgeRdd = relationRDD.map(it -> new + // Edge<>(it._1().hashCode(), + // it._2().hashCode(), "equalTo")).rdd(); + // + // RDD> vertexes = + // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> + // new + // Tuple2((long) t._1().hashCode(), t._2())).rdd(); + // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); + // + // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); } /** @@ -58,71 +62,92 @@ public class Deduper implements Serializable { * @param: list of blocks * @param: the dedup configuration */ - public static JavaPairRDD computeRelations(JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { + public static JavaPairRDD computeRelations( + JavaSparkContext context, + JavaPairRDD> blocks, + DedupConfig config) { - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); + Map accumulators = + DedupUtility.constructAccumulator(config, context.sc()); - return 
blocks.flatMapToPair((PairFlatMapFunction>, String, String>) it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).process(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - - }).mapToPair( - (PairFunction, String, Tuple2>) item -> - new Tuple2>(item._1() + item._2(), item)) + return blocks.flatMapToPair( + (PairFlatMapFunction>, String, String>) + it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config).process(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + }) + .mapToPair( + (PairFunction, String, Tuple2>) + item -> + new Tuple2>( + item._1() + item._2(), item)) .reduceByKey((a, b) -> a) - .mapToPair((PairFunction>, String, String>) Tuple2::_2); + .mapToPair( + (PairFunction>, String, String>) + Tuple2::_2); } - /** * @return the list of blocks based on clustering of dedup configuration * @param: the spark context * @param: list of entities: * @param: the dedup configuration */ - public static JavaPairRDD> createBlocks(JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { + public static JavaPairRDD> createBlocks( + JavaSparkContext context, + JavaPairRDD mapDocs, + DedupConfig config) { return mapDocs - //the reduce is just to be sure that we haven't document with same id + // the reduce is just to be sure that we haven't document with same id .reduceByKey((a, b) -> a) .map(Tuple2::_2) - //Clustering: from to List - .flatMapToPair((PairFlatMapFunction) a -> - DedupUtility.getGroupingKeys(config, a) - .stream() - .map(it -> new Tuple2<>(it, a)) - .collect(Collectors.toList()) - .iterator()) + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction) + a -> + DedupUtility.getGroupingKeys(config, a).stream() + .map(it -> new Tuple2<>(it, a)) + .collect(Collectors.toList()) + .iterator()) .groupByKey(); } - - public static JavaPairRDD> createsortedBlocks(JavaSparkContext context, JavaPairRDD 
mapDocs, DedupConfig config) { + public static JavaPairRDD> createsortedBlocks( + JavaSparkContext context, + JavaPairRDD mapDocs, + DedupConfig config) { final String of = config.getWf().getOrderField(); final int maxQueueSize = config.getWf().getGroupMaxSize(); return mapDocs - //the reduce is just to be sure that we haven't document with same id + // the reduce is just to be sure that we haven't document with same id .reduceByKey((a, b) -> a) .map(Tuple2::_2) - //Clustering: from to List - .flatMapToPair((PairFlatMapFunction>) a -> - DedupUtility.getGroupingKeys(config, a) - .stream() - .map(it -> { - List tmp = new ArrayList<>(); - tmp.add(a); - return new Tuple2<>(it, tmp); - } - ) - .collect(Collectors.toList()) - .iterator()) - .reduceByKey((Function2, List, List>) (v1, v2) -> { - v1.addAll(v2); - v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); - if (v1.size() > maxQueueSize) - return new ArrayList<>(v1.subList(0, maxQueueSize)); - return v1; - }); + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction>) + a -> + DedupUtility.getGroupingKeys(config, a).stream() + .map( + it -> { + List tmp = + new ArrayList<>(); + tmp.add(a); + return new Tuple2<>(it, tmp); + }) + .collect(Collectors.toList()) + .iterator()) + .reduceByKey( + (Function2, List, List>) + (v1, v2) -> { + v1.addAll(v2); + v1.sort( + Comparator.comparing( + a -> a.getFieldMap().get(of).stringValue())); + if (v1.size() > maxQueueSize) + return new ArrayList<>(v1.subList(0, maxQueueSize)); + return v1; + }); } /** @@ -131,32 +156,48 @@ public class Deduper implements Serializable { * @param: list of JSON entities * @param: the dedup configuration */ - public static JavaPairRDD mapToVertexes(JavaSparkContext context, JavaRDD entities, DedupConfig config) { + public static JavaPairRDD mapToVertexes( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - return entities.mapToPair((PairFunction) s -> { - - MapDocument mapDocument = 
MapDocumentUtil.asMapDocumentWithJPath(config, s); - return new Tuple2(mapDocument.getIdentifier(), mapDocument); - - - }); + return entities.mapToPair( + (PairFunction) + s -> { + MapDocument mapDocument = + MapDocumentUtil.asMapDocumentWithJPath(config, s); + return new Tuple2( + mapDocument.getIdentifier(), mapDocument); + }); } - public static JavaPairRDD computeRelations2(JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { - Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations2( + JavaSparkContext context, + JavaPairRDD> blocks, + DedupConfig config) { + Map accumulators = + DedupUtility.constructAccumulator(config, context.sc()); - return blocks.flatMapToPair((PairFlatMapFunction>, String, String>) it -> { - try { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - } catch (Exception e) { - throw new RuntimeException(it._2().get(0).getIdentifier(), e); - } - }).mapToPair( - (PairFunction, String, Tuple2>) item -> - new Tuple2>(item._1() + item._2(), item)) + return blocks.flatMapToPair( + (PairFlatMapFunction>, String, String>) + it -> { + try { + final SparkReporter reporter = + new SparkReporter(accumulators); + new BlockProcessor(config) + .processSortedBlock(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + } catch (Exception e) { + throw new RuntimeException( + it._2().get(0).getIdentifier(), e); + } + }) + .mapToPair( + (PairFunction, String, Tuple2>) + item -> + new Tuple2>( + item._1() + item._2(), item)) .reduceByKey((a, b) -> a) - .mapToPair((PairFunction>, String, String>) Tuple2::_2); + .mapToPair( + (PairFunction>, String, String>) + Tuple2::_2); } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java 
b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java index fb347ed51..10632240a 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java @@ -1,7 +1,6 @@ package eu.dnetlib.dedup; public enum OafEntityType { - datasource, organization, project, @@ -9,7 +8,4 @@ public enum OafEntityType { otherresearchproduct, software, publication - - - } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index d3020b92c..29068e132 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -1,8 +1,5 @@ package eu.dnetlib.dedup; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dedup.graph.ConnectedComponent; import eu.dnetlib.dedup.graph.GraphProcessor; @@ -10,6 +7,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.util.ArrayList; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -23,58 +22,91 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import scala.Tuple2; -import java.util.ArrayList; -import java.util.List; - public class SparkCreateConnectedComponent { public static void main(String[] args) throws Exception { - final 
ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateConnectedComponent.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkCreateConnectedComponent.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); final String entity = parser.get("entity"); final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + // final DedupConfig dedupConf = + // DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) - s -> new Tuple2(getHashcode(s), s) - ); + final JavaPairRDD vertexes = + sc.textFile(inputPath + "/" + entity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair( + (PairFunction) + s -> new Tuple2(getHashcode(s), s)); - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); - 
final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); - final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> - c.getDocIds() - .stream() - .flatMap(id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity)); + final Dataset similarityRelations = + spark.read() + .load(DedupUtility.createSimRelPath(targetPath, entity)) + .as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = + similarityRelations + .javaRDD() + .map( + it -> + new Edge<>( + getHashcode(it.getSource()), + getHashcode(it.getTarget()), + it.getRelClass())) + .rdd(); + final JavaRDD cc = + GraphProcessor.findCCs( + vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) + .toJavaRDD(); + final Dataset mergeRelation = + spark.createDataset( + cc.filter(k -> k.getDocIds().size() > 1) + .flatMap( + (FlatMapFunction) + c -> + c.getDocIds().stream() + .flatMap( + id -> { + List tmp = + new ArrayList<>(); + Relation r = + new Relation(); + r.setSource( + c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget( + c.getCcId()); + r.setSource(id); + r.setRelClass( + "isMergedIn"); + tmp.add(r); + return tmp.stream(); + }) + .iterator()) + .rdd(), + Encoders.bean(Relation.class)); + mergeRelation + .write() + .mode("overwrite") + .save(DedupUtility.createMergeRelPath(targetPath, 
entity)); } - public static long getHashcode(final String id) { + public static long getHashcode(final String id) { return Hashing.murmur3_128().hashString(id).asLong(); } - } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index 09c0ba89b..780356fdf 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -11,13 +11,17 @@ import org.apache.spark.sql.SparkSession; public class SparkCreateDedupRecord { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateDedupRecord.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String sourcePath = parser.get("sourcePath"); @@ -25,10 +29,20 @@ public class SparkCreateDedupRecord { final String dedupPath = parser.get("dedupPath"); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaRDD dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), 
DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); - dedupRecord.map(r-> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }).saveAsTextFile(dedupPath+"/"+entity+"/dedup_records"); + final JavaRDD dedupRecord = + DedupRecordFactory.createDedupRecord( + sc, + spark, + DedupUtility.createMergeRelPath(dedupPath, entity), + DedupUtility.createEntityPath(sourcePath, entity), + OafEntityType.valueOf(entity), + dedupConf); + dedupRecord + .map( + r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }) + .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index b847c80dc..172843348 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,73 +1,81 @@ package eu.dnetlib.dedup; -import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import scala.Tuple2; -import java.util.List; - - /** * This Spark class creates similarity relations between entities, saving result * - * param request: - * sourcePath - * entityType - * target Path + *

param request: sourcePath entityType target Path */ public class SparkCreateSimRels { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkCreateSimRels.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); final String entity = parser.get("entity"); final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + // final DedupConfig dedupConf = + // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + JavaPairRDD mapDocument = + sc.textFile(inputPath + "/" + entity) + .mapToPair( + s -> { + MapDocument d = + MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + // create blocks for deduplication + JavaPairRDD> blocks = + Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + // JavaPairRDD> blocks = Deduper.createBlocks(sc, + // mapDocument, dedupConf); - JavaPairRDD mapDocument = sc.textFile(inputPath + "/" + entity) - 
.mapToPair(s->{ - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s); - return new Tuple2<>(d.getIdentifier(), d);}); + // create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = + Deduper.computeRelations2(sc, blocks, dedupConf); + // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, + // dedupConf); - //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); -// JavaPairRDD> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf); - - //create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); -// final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, dedupConf); - - final JavaRDD isSimilarToRDD = dedupRels.map(simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); - - spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); + final JavaRDD isSimilarToRDD = + dedupRels.map( + simRel -> { + final Relation r = new Relation(); + r.setSource(simRel._1()); + r.setTarget(simRel._2()); + r.setRelClass("isSimilarTo"); + return r; + }); + spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(targetPath, entity)); } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java index 165a10b25..a77484892 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java @@ -1,33 +1,35 @@ package eu.dnetlib.dedup; 
import eu.dnetlib.pace.util.Reporter; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.util.LongAccumulator; import scala.Serializable; import scala.Tuple2; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - public class SparkReporter implements Serializable, Reporter { final List> relations = new ArrayList<>(); private static final Log log = LogFactory.getLog(SparkReporter.class); Map accumulators; - public SparkReporter(Map accumulators){ + public SparkReporter(Map accumulators) { this.accumulators = accumulators; } - public void incrementCounter(String counterGroup, String counterName, long delta, Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)){ + if (accumulators.containsKey(accumulatorName)) { accumulators.get(accumulatorName).add(delta); } - } @Override diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java index 27a61c02d..41d53944f 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java @@ -3,21 +3,18 @@ package eu.dnetlib.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang.StringUtils; -import org.codehaus.jackson.annotate.JsonIgnore; - import java.io.IOException; import java.io.Serializable; import java.util.Set; +import 
org.apache.commons.lang.StringUtils; +import org.codehaus.jackson.annotate.JsonIgnore; public class ConnectedComponent implements Serializable { private Set docIds; private String ccId; - - public ConnectedComponent() { - } + public ConnectedComponent() {} public ConnectedComponent(Set docIds) { this.docIds = docIds; @@ -28,7 +25,7 @@ public class ConnectedComponent implements Serializable { if (docIds.size() > 1) { final String s = getMin(); String prefix = s.split("\\|")[0]; - ccId =prefix + "|dedup_______::" + DedupUtility.md5(s); + ccId = prefix + "|dedup_______::" + DedupUtility.md5(s); return ccId; } else { return docIds.iterator().next(); @@ -36,24 +33,25 @@ public class ConnectedComponent implements Serializable { } @JsonIgnore - public String getMin(){ + public String getMin() { final StringBuilder min = new StringBuilder(); - docIds.forEach(i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); + docIds.forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); return min.toString(); } @Override - public String toString(){ + public String toString() { ObjectMapper mapper = new ObjectMapper(); try { return mapper.writeValueAsString(this); diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java index a72529443..968f71ef0 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java @@ -6,60 +6,79 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.DHPUtils; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import scala.Tuple2; -import java.io.IOException; - public class SparkPropagateRelationsJob { enum FieldType { SOURCE, TARGET } - final static String SOURCEJSONPATH = "$.source"; - final static String TARGETJSONPATH = "$.target"; + + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkPropagateRelationsJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String relationPath = parser.get("relationPath"); final String mergeRelPath = parser.get("mergeRelPath"); final String targetRelPath = parser.get("targetRelPath"); + final Dataset merge = + spark.read() + .load(mergeRelPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'"); - final Dataset merge = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)).where("relClass == 'merges'"); + 
final Dataset rels = + spark.read().load(relationPath).as(Encoders.bean(Relation.class)); - final Dataset rels= spark.read().load(relationPath).as(Encoders.bean(Relation.class)); + final Dataset firstJoin = + rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") + .map( + (MapFunction, Relation>) + r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); + if (mergeRelation != null) + relation.setSource(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); - final Dataset firstJoin = rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") - .map((MapFunction, Relation>) r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - - if(mergeRelation!= null) - relation.setSource(mergeRelation.getSource()); - return relation; - }, Encoders.bean(Relation.class)); - - final Dataset secondJoin = firstJoin.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map((MapFunction, Relation>) r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - if (mergeRelation != null ) - relation.setTarget(mergeRelation.getSource()); - return relation; - }, Encoders.bean(Relation.class)); + final Dataset secondJoin = + firstJoin + .joinWith( + merge, + merge.col("target").equalTo(firstJoin.col("target")), + "left_outer") + .map( + (MapFunction, Relation>) + r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); + if (mergeRelation != null) + relation.setTarget(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); } @@ -71,14 +90,12 @@ public class SparkPropagateRelationsJob { return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); } - private static String replaceField(final String json, final String id, final FieldType type) { 
ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); try { Relation relation = mapper.readValue(json, Relation.class); - if (relation.getDataInfo() == null) - relation.setDataInfo(new DataInfo()); + if (relation.getDataInfo() == null) relation.setDataInfo(new DataInfo()); relation.getDataInfo().setDeletedbyinference(false); switch (type) { case SOURCE: diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java index 44f7c551e..c5a9581ec 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java @@ -10,6 +10,7 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.utils.DHPUtils; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaPairRDD; @@ -19,20 +20,22 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.*; import scala.Tuple2; -import java.io.IOException; - public class SparkUpdateEntityJob { - final static String IDJSONPATH = "$.id"; + static final String IDJSONPATH = "$.id"; public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkUpdateEntityJob.class.getResourceAsStream( + 
"/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String entityPath = parser.get("entityPath"); @@ -41,52 +44,59 @@ public class SparkUpdateEntityJob { final String entity = parser.get("entity"); final String destination = parser.get("targetPath"); - final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = df - .where("relClass == 'merges'") - .select(df.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final Dataset df = + spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = + df.where("relClass == 'merges'") + .select(df.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) + r -> new Tuple2<>(r.getString(0), "d")); final JavaRDD sourceEntity = sc.textFile(entityPath); final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); - JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); - Class mainClass; - switch (entity) { - case "publication": - mainClass = DLIPublication.class; - break; - case "dataset": - mainClass = DLIDataset.class; - break; - case "unknown": - mainClass = DLIUnknown.class; - break; - default: - throw new IllegalArgumentException("Illegal type " + entity); - - } - JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? 
updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1()); - map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); - + JavaPairRDD entitiesWithId = + sourceEntity.mapToPair( + (PairFunction) + s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); + Class mainClass; + switch (entity) { + case "publication": + mainClass = DLIPublication.class; + break; + case "dataset": + mainClass = DLIDataset.class; + break; + case "unknown": + mainClass = DLIUnknown.class; + break; + default: + throw new IllegalArgumentException("Illegal type " + entity); + } + JavaRDD map = + entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> + k._2()._2().isPresent() + ? updateDeletedByInference(k._2()._1(), mainClass) + : k._2()._1()); + map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); } - private static String updateDeletedByInference(final String json, final Class clazz) { + private static String updateDeletedByInference( + final String json, final Class clazz) { final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); try { Oaf entity = mapper.readValue(json, clazz); - if (entity.getDataInfo()== null) - entity.setDataInfo(new DataInfo()); + if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); entity.getDataInfo().setDeletedbyinference(true); return mapper.writeValueAsString(entity); } catch (IOException e) { throw new RuntimeException("Unable to convert json", e); } - - } - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJob.java index 0270076dd..d0fe95289 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJob.java @@ -1,8 +1,11 @@ package 
eu.dnetlib.dhp.oa.graph; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -12,26 +15,23 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class GraphHiveImporterJob { private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class); public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(GraphHiveImporterJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + GraphHiveImporterJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputPath = parser.get("inputPath"); @@ -46,7 +46,9 @@ public class GraphHiveImporterJob { SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", hiveMetastoreUris); - runWithSparkHiveSession(conf, isSparkSessionManaged, + runWithSparkHiveSession( + conf, + 
isSparkSessionManaged, spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName)); } @@ -58,12 +60,15 @@ public class GraphHiveImporterJob { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); // Read the input file and convert it into RDD of serializable object - ModelSupport.oafTypes.forEach((name, clazz) -> spark.createDataset(sc.textFile(inputPath + "/" + name) - .map(s -> new ObjectMapper().readValue(s, clazz)) - .rdd(), Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name)); + ModelSupport.oafTypes.forEach( + (name, clazz) -> + spark.createDataset( + sc.textFile(inputPath + "/" + name) + .map(s -> new ObjectMapper().readValue(s, clazz)) + .rdd(), + Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." + name)); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 7d99a4774..f1058db38 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -1,364 +1,443 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; + import eu.dnetlib.dhp.schema.oaf.*; +import java.util.*; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentFactory; import org.dom4j.DocumentHelper; import org.dom4j.Node; -import java.util.*; - -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; - public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name; - - protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - - 
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = - qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); - protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); - protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); - protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); - - protected AbstractMdRecordToOafMapper(final Map code2name) { - this.code2name = code2name; - } - - public List processMdRecord(final String xml) { - try { - final Map nsContext = new HashMap<>(); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - - final Document doc = DocumentHelper.parseText(xml.replaceAll("http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3")); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name")); - final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? 
collectedFrom - : keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name")); - - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - protected List createOafs(final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List oafs = new ArrayList<>(); - - switch (type.toLowerCase()) { - case "": - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); 
- break; - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; - } - - if (!oafs.isEmpty()) { - oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); - oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); - } - - return oafs; - } - - private List addProjectRels(final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List res = new ArrayList<>(); - - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - final String projectId = createOpenaireId(40, ((Node) o).getText(), true); - - final Relation r1 = new Relation(); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("isProducedBy"); - r1.setSource(docId); - r1.setTarget(projectId); - r1.setCollectedFrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); - - final Relation r2 = new Relation(); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("produces"); - r2.setSource(projectId); - r2.setTarget(docId); - r2.setCollectedFrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - - return res; - } - - protected abstract List addOtherResultRels(final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); - - private void populateResultFields(final Result r, 
- final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); - } - - protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); - - protected abstract List prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, 
KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); - - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons(Document doc, DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); - - protected abstract List> prepareSoftwareDocumentationUrls(Document doc, DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); - - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, 
DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } - } - return null; - } - - protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) { - final String classId = node.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); - } - - protected List prepareListStructProps(final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId); - final String className = code2name.get(classId); - res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); - } - return res; - } - - protected List prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - protected List prepareListStructProps(final Node 
node, final String xpath, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n - .valueOf("@schemename"), info)); - } - return res; - } - - protected OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - if (n == null) { return null; } - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - final String baseURL = n.valueOf("./*[local-name()='baseURL']");; - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']");; - final String harvestDate = n.valueOf("@harvestDate");; - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - - } - - protected DataInfo prepareDataInfo(final Document doc) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - if (n == null) { return null; } - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), 
trust); - } - - protected Field prepareField(final Node node, final String xpath, final DataInfo info) { - return field(node.valueOf(xpath), info); - } - - protected List> prepareListFields(final Node node, final String xpath, final DataInfo info) { - return listFields(info, prepareListString(node, xpath)); - } - - protected List prepareListString(final Node node, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final String s = ((Node) o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } - + protected final Map code2name; + + protected static final Qualifier MAIN_TITLE_QUALIFIER = + qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = + qualifier( + "publication", + "publication", + "dnet:result_typologies", + "dnet:result_typologies"); + protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = + qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = + qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies"); + protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = + qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies"); + + protected AbstractMdRecordToOafMapper(final Map code2name) { + this.code2name = code2name; + } + + public List processMdRecord(final String xml) { + try { + final Map nsContext = new HashMap<>(); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", 
"http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + + final Document doc = + DocumentHelper.parseText( + xml.replaceAll( + "http://datacite.org/schema/kernel-4", + "http://datacite.org/schema/kernel-3")); + + final String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = + keyValue( + doc.valueOf("//oaf:collectedFrom/@id"), + doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = + StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? collectedFrom + : keyValue( + doc.valueOf("//oaf:hostedBy/@id"), + doc.valueOf("//oaf:hostedBy/@name")); + + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected List createOafs( + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "": + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(DATASET_RESULTTYPE_QUALIFIER); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + 
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; + } + + if (!oafs.isEmpty()) { + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); + } + + return oafs; + } + + private List addProjectRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + final String projectId = createOpenaireId(40, ((Node) o).getText(), true); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + 
r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); + r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + + return res; + } + + protected abstract List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + + private void populateResultFields( + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid( + prepareListStructProps( + doc, + "//oaf:identifier", + "@identifierType", + "dnet:pid_types", + "dnet:pid_types", + info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + 
r.setSource(prepareSources(doc, info)); + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); + } + + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances( + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons( + Document doc, DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field 
prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses( + Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls( + Document doc, DataInfo info); + + protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber( + Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + if (StringUtils.isNotBlank(name)) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); + } + } + return null; + } + + protected Qualifier prepareQualifier( + final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, schemeId, schemeName); + } + + protected List prepareListStructProps( + final Node node, + final String xpath, + final String xpathClassId, + final 
String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = code2name.get(classId); + res.add( + structuredProperty( + n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = + doc.selectSingleNode( + "//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { + return null; + } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + ; + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + ; + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + ; + final String harvestDate = n.valueOf("@harvestDate"); + ; + + return oaiIProvenance( + identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + protected DataInfo prepareDataInfo(final Document doc) { + final Node n = 
doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { + return null; + } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = + Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo( + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); + } + + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + protected List> prepareListFields( + final Node node, final String xpath, final DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java index 4812f1c30..4ee0ce3a4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java @@ -1,94 +1,96 @@ package eu.dnetlib.dhp.oa.graph.raw; +import 
static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.*; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class DispatchEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateMongoMdstoresApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateMongoMdstoresApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); + 
parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("graphRawPath"); + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); - SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, targetPath); + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); - processEntity(spark, Publication.class, sourcePath, targetPath); - processEntity(spark, Dataset.class, sourcePath, targetPath); - processEntity(spark, Software.class, sourcePath, targetPath); - processEntity(spark, OtherResearchProduct.class, sourcePath, targetPath); - processEntity(spark, Datasource.class, sourcePath, targetPath); - processEntity(spark, Organization.class, sourcePath, targetPath); - processEntity(spark, Project.class, sourcePath, targetPath); - processEntity(spark, Relation.class, sourcePath, targetPath); - }); - } + processEntity(spark, Publication.class, sourcePath, targetPath); + processEntity(spark, Dataset.class, sourcePath, targetPath); + processEntity(spark, Software.class, sourcePath, targetPath); + processEntity(spark, OtherResearchProduct.class, sourcePath, targetPath); + processEntity(spark, Datasource.class, sourcePath, targetPath); + processEntity(spark, Organization.class, sourcePath, targetPath); + processEntity(spark, Project.class, sourcePath, targetPath); + processEntity(spark, Relation.class, 
sourcePath, targetPath); + }); + } - private static void processEntity(final SparkSession spark, final Class clazz, final String sourcePath, final String targetPath) { - final String type = clazz.getSimpleName().toLowerCase(); + private static void processEntity( + final SparkSession spark, + final Class clazz, + final String sourcePath, + final String targetPath) { + final String type = clazz.getSimpleName().toLowerCase(); - log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath)); + log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath)); - /* - spark.read() - .textFile(sourcePath) - .filter((FilterFunction) value -> isEntityType(value, type)) - .map((MapFunction) value -> StringUtils.substringAfter(value, "|"), Encoders.STRING()) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .parquet(targetPath + "/" + type); + /* + spark.read() + .textFile(sourcePath) + .filter((FilterFunction) value -> isEntityType(value, type)) + .map((MapFunction) value -> StringUtils.substringAfter(value, "|"), Encoders.STRING()) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .parquet(targetPath + "/" + type); - */ + */ - JavaSparkContext.fromSparkContext(spark.sparkContext()) - .textFile(sourcePath) - .filter(l -> isEntityType(l, type)) - .map(l -> StringUtils.substringAfter(l, "|")) - .saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ??? - } + JavaSparkContext.fromSparkContext(spark.sparkContext()) + .textFile(sourcePath) + .filter(l -> isEntityType(l, type)) + .map(l -> StringUtils.substringAfter(l, "|")) + .saveAsTextFile( + targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ??? 
+ } - private static boolean isEntityType(final String line, final String type) { - return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); - } - - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index a9f331f53..34ae2df9b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -1,14 +1,18 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; import eu.dnetlib.dhp.schema.oaf.*; +import java.io.IOException; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; @@ -21,152 +25,160 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import 
java.io.IOException; -import java.sql.SQLException; -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class GenerateEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateMongoMdstoresApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateMongoMdstoresApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePaths = parser.get("sourcePaths"); - final String targetPath = parser.get("targetPath"); + final String sourcePaths = parser.get("sourcePaths"); + final String targetPath = parser.get("targetPath"); - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = 
parser.get("postgresPassword"); - final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); - SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); - }); - } + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); + generateEntities(spark, code2name, sourcePaths, targetPath); + }); + } - private static void generateEntities(final SparkSession spark, - final Map code2name, - final String sourcePaths, - final String targetPath) { + private static void generateEntities( + final SparkSession spark, + final Map code2name, + final String sourcePaths, + final String targetPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final List existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final List existingSourcePaths = + Arrays.stream(sourcePaths.split(",")) + .filter(p -> exists(sc, p)) + .collect(Collectors.toList()); - log.info("Generate entities from files:"); - existingSourcePaths.forEach(log::info); + log.info("Generate entities from files:"); + existingSourcePaths.forEach(log::info); - JavaRDD inputRdd = sc.emptyRDD(); + JavaRDD inputRdd = sc.emptyRDD(); - for (final String sp : existingSourcePaths) { - inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .flatMap(list -> list.iterator()) - .map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf))); - } + for (final String sp : existingSourcePaths) { + inputRdd = + 
inputRdd.union( + sc.sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .flatMap(list -> list.iterator()) + .map( + oaf -> + oaf.getClass().getSimpleName().toLowerCase() + + "|" + + convertToJson(oaf))); + } - inputRdd - .saveAsTextFile(targetPath, GzipCodec.class); + inputRdd.saveAsTextFile(targetPath, GzipCodec.class); + } - } + private static List convertToListOaf( + final String id, final String s, final Map code2name) { + final String type = StringUtils.substringAfter(id, ":"); - private static List convertToListOaf(final String id, final String s, final Map code2name) { - final String type = StringUtils.substringAfter(id, ":"); + switch (type.toLowerCase()) { + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": + return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproducts": + default: + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + } + } - switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); 
- case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproducts": - default: - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - } + private static Map loadClassNames( + final String dbUrl, final String dbUser, final String dbPassword) throws IOException { - } + log.info("Loading vocabulary terms from db..."); - private static Map loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + final Map map = new HashMap<>(); - log.info("Loading vocabulary terms from db..."); + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + dbClient.processResults( + "select code, name from class", + rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } - final Map map = new HashMap<>(); + log.info("Found " + map.size() + " terms."); - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - dbClient.processResults("select code, name from class", rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); - } + return map; + } - log.info("Found " + map.size() + " terms."); + private static String convertToJson(final Oaf oaf) { + try { + return new ObjectMapper().writeValueAsString(oaf); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - return map; + private static Oaf convertFromJson(final String s, final Class clazz) { + try { + return new ObjectMapper().readValue(s, clazz); + } catch (final Exception e) { + log.error("Error parsing object 
of class: " + clazz); + log.error(s); + throw new RuntimeException(e); + } + } - } + private static boolean exists(final JavaSparkContext context, final String pathToFile) { + try { + final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); + final Path path = new Path(pathToFile); + return hdfs.exists(path); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } - private static String convertToJson(final Oaf oaf) { - try { - return new ObjectMapper().writeValueAsString(oaf); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - private static Oaf convertFromJson(final String s, final Class clazz) { - try { - return new ObjectMapper().readValue(s, clazz); - } catch (final Exception e) { - log.error("Error parsing object of class: " + clazz); - log.error(s); - throw new RuntimeException(e); - } - } - - private static boolean exists(final JavaSparkContext context, final String pathToFile) { - try { - final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); - final Path path = new Path(pathToFile); - return hdfs.exists(path); - } catch (final IOException e) { - throw new RuntimeException(e); - } - } - - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java index 4b209c68a..130b826e9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java @@ -1,13 +1,17 @@ package 
eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FilterFunction; @@ -19,141 +23,186 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import scala.reflect.ClassTag; -import scala.reflect.ClassTag$; - -import java.util.Objects; -import java.util.Optional; -import java.util.function.Function; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; public class MergeClaimsApplication { - private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); + private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateMongoMdstoresApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final 
ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateMongoMdstoresApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String rawGraphPath = parser.get("rawGraphPath"); - log.info("rawGraphPath: {}", rawGraphPath); + final String rawGraphPath = parser.get("rawGraphPath"); + log.info("rawGraphPath: {}", rawGraphPath); - final String claimsGraphPath = parser.get("claimsGraphPath"); - log.info("claimsGraphPath: {}", claimsGraphPath); + final String claimsGraphPath = parser.get("claimsGraphPath"); + log.info("claimsGraphPath: {}", claimsGraphPath); - final String outputRawGaphPath = parser.get("outputRawGaphPath"); - log.info("outputRawGaphPath: {}", outputRawGaphPath); + final String outputRawGaphPath = parser.get("outputRawGaphPath"); + log.info("outputRawGaphPath: {}", outputRawGaphPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class clazz = (Class) Class.forName(graphTableClassName); + Class clazz = (Class) Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - String type = clazz.getSimpleName().toLowerCase(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + String type = clazz.getSimpleName().toLowerCase(); - String rawPath = rawGraphPath + "/" + type; - String claimPath = claimsGraphPath + "/" + type; - String outPath = outputRawGaphPath + "/" + type; + String rawPath = rawGraphPath + "/" + type; + String claimPath = claimsGraphPath + "/" + type; + String outPath = outputRawGaphPath + "/" + type; - removeOutputDir(spark, outPath); - mergeByType(spark, rawPath, claimPath, outPath, clazz); - }); - } + removeOutputDir(spark, outPath); + mergeByType(spark, rawPath, claimPath, outPath, clazz); + }); + } - private static void mergeByType(SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { - Dataset> raw = readFromPath(spark, rawPath, clazz) - .map((MapFunction>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + private static void mergeByType( + SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { + Dataset> raw = + readFromPath(spark, rawPath, clazz) + .map( + (MapFunction>) + value -> new Tuple2<>(idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Dataset> claim = jsc.broadcast(readFromPath(spark, claimPath, clazz)) - .getValue() - .map((MapFunction>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Dataset> claim = + jsc.broadcast(readFromPath(spark, claimPath, clazz)) + .getValue() + .map( + (MapFunction>) + value -> new 
Tuple2<>(idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - /* - Dataset> claim = readFromPath(spark, claimPath, clazz) - .map((MapFunction>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - */ + /* + Dataset> claim = readFromPath(spark, claimPath, clazz) + .map((MapFunction>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + */ - raw.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") - .map((MapFunction, Tuple2>, T>) value -> { + raw.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") + .map( + (MapFunction, Tuple2>, T>) + value -> { + Optional> opRaw = + Optional.ofNullable(value._1()); + Optional> opClaim = + Optional.ofNullable(value._2()); - Optional> opRaw = Optional.ofNullable(value._1()); - Optional> opClaim = Optional.ofNullable(value._2()); + return opRaw.isPresent() + ? opRaw.get()._2() + : opClaim.isPresent() ? opClaim.get()._2() : null; + }, + Encoders.bean(clazz)) + .filter(Objects::nonNull) + .map( + (MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outPath); + } - return opRaw.isPresent() ? opRaw.get()._2() : opClaim.isPresent() ? 
opClaim.get()._2() : null; - }, Encoders.bean(clazz)) - .filter(Objects::nonNull) - .map((MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), Encoders.STRING()) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outPath); - } + private static Dataset readFromPath( + SparkSession spark, String path, Class clazz) { + return spark.read() + .textFile(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), + Encoders.bean(clazz)) + .filter((FilterFunction) value -> Objects.nonNull(idFn().apply(value))); + /* + return spark.read() + .load(path) + .as(Encoders.bean(clazz)) + .filter((FilterFunction) value -> Objects.nonNull(idFn().apply(value))); + */ + } - private static Dataset readFromPath(SparkSession spark, String path, Class clazz) { - return spark.read() - .textFile(path) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)) - .filter((FilterFunction) value -> Objects.nonNull(idFn().apply(value))); - /* - return spark.read() - .load(path) - .as(Encoders.bean(clazz)) - .filter((FilterFunction) value -> Objects.nonNull(idFn().apply(value))); - */ - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static Function idFn() { + return x -> { + if (isSubClass(x, Relation.class)) { + return idFnForRelation(x); + } + return idFnForOafEntity(x); + }; + } - private static Function idFn() { - return x -> { - if (isSubClass(x, Relation.class)) { - return idFnForRelation(x); - } - return idFnForOafEntity(x); - }; - } - - private static String idFnForRelation(T t) { - Relation r = (Relation) t; - return Optional.ofNullable(r.getSource()) - .map(source -> Optional.ofNullable(r.getTarget()) - .map(target -> 
Optional.ofNullable(r.getRelType()) - .map(relType -> Optional.ofNullable(r.getSubRelType()) - .map(subRelType -> Optional.ofNullable(r.getRelClass()) - .map(relClass -> String.join(source, target, relType, subRelType, relClass)) - .orElse(String.join(source, target, relType, subRelType)) - ) - .orElse(String.join(source, target, relType)) - ) - .orElse(String.join(source, target)) - ) - .orElse(source) - ) - .orElse(null); - } - - private static String idFnForOafEntity(T t) { - return ((OafEntity) t).getId(); - } + private static String idFnForRelation(T t) { + Relation r = (Relation) t; + return Optional.ofNullable(r.getSource()) + .map( + source -> + Optional.ofNullable(r.getTarget()) + .map( + target -> + Optional.ofNullable(r.getRelType()) + .map( + relType -> + Optional.ofNullable( + r + .getSubRelType()) + .map( + subRelType -> + Optional + .ofNullable( + r + .getRelClass()) + .map( + relClass -> + String + .join( + source, + target, + relType, + subRelType, + relClass)) + .orElse( + String + .join( + source, + target, + relType, + subRelType))) + .orElse( + String + .join( + source, + target, + relType))) + .orElse( + String.join( + source, target))) + .orElse(source)) + .orElse(null); + } + private static String idFnForOafEntity(T t) { + return ((OafEntity) t).getId(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index f8f6b58cc..6de7303c3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -1,14 +1,11 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import java.io.Closeable; import java.io.IOException; import java.sql.Array; @@ -20,437 +17,510 @@ import java.util.Date; import java.util.List; import java.util.function.Consumer; import java.util.function.Function; - -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; - -public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { - - private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = - qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions"); - - private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); - - private final DbClient dbClient; - - private final long lastUpdateTimestamp; - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); - - parser.parseArgument(args); - - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); - - final String hdfsPath = parser.get("hdfsPath"); - - final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); - - try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { - if (processClaims) { - log.info("Processing claims..."); - smdbe.execute("queryClaims.sql", smdbe::processClaims); - } else 
{ - log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); - - log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); - - log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); - - log.info("Processing relations ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); - - log.info("Processing projects <-> orgs ..."); - smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); - } - log.info("All done."); - } - } - - protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST - super(); - this.dbClient = null; - this.lastUpdateTimestamp = new Date().getTime(); - } - - public MigrateDbEntitiesApplication(final String hdfsPath, final String dbUrl, final String dbUser, - final String dbPassword) throws Exception { - super(hdfsPath); - this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); - this.lastUpdateTimestamp = new Date().getTime(); - } - - public void execute(final String sqlFile, final Function> producer) throws Exception { - final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); - - final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); - - dbClient.processResults(sql, consumer); - } - - public List processDatasource(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Datasource ds = new Datasource(); - - ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); - ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); - ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - ds.setPid(new ArrayList<>()); - ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); - ds.setDateoftransformation(null); // Value not returned by the SQL query - 
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB - ds.setOaiprovenance(null); // Values not present in the DB - ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); - ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); - ds.setOfficialname(field(rs.getString("officialname"), info)); - ds.setEnglishname(field(rs.getString("englishname"), info)); - ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); - ds.setLogourl(field(rs.getString("logourl"), info)); - ds.setContactemail(field(rs.getString("contactemail"), info)); - ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); - ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); - ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); - ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); - ds.setDescription(field(rs.getString("description"), info)); - ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); - ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); - ds.setOdpolicies(field(rs.getString("odpolicies"), info)); - ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); - ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); - ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); - ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); - ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); - ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); - ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); - ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); - ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); - 
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); - ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); - ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); - ds.setVersioning(field(rs.getBoolean("versioning"), info)); - ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); - ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); - ds.setPidsystems(field(rs.getString("pidsystems"), info)); - ds.setCertificates(field(rs.getString("certificates"), info)); - ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array - ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal - ds.setDataInfo(info); - ds.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(ds); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProject(final ResultSet rs) { - try { - - final DataInfo info = prepareDataInfo(rs); - - final Project p = new Project(); - - p.setId(createOpenaireId(40, rs.getString("projectid"), true)); - p.setOriginalId(Arrays.asList(rs.getString("projectid"))); - p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - p.setPid(new ArrayList<>()); - p.setDateofcollection(asString(rs.getDate("dateofcollection"))); - p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - p.setExtraInfo(new ArrayList<>()); // Values not present in the DB - p.setOaiprovenance(null); // Values not present in the DB - p.setWebsiteurl(field(rs.getString("websiteurl"), info)); - p.setCode(field(rs.getString("code"), info)); - p.setAcronym(field(rs.getString("acronym"), info)); - p.setTitle(field(rs.getString("title"), info)); - p.setStartdate(field(asString(rs.getDate("startdate")), info)); - p.setEnddate(field(asString(rs.getDate("enddate")), info)); - 
p.setCallidentifier(field(rs.getString("callidentifier"), info)); - p.setKeywords(field(rs.getString("keywords"), info)); - p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); - p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); - p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); - p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); - p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); - p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); - p.setOptional1(field(rs.getString("optional1"), info)); - p.setOptional2(field(rs.getString("optional2"), info)); - p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); - p.setContactfullname(field(rs.getString("contactfullname"), info)); - p.setContactfax(field(rs.getString("contactfax"), info)); - p.setContactphone(field(rs.getString("contactphone"), info)); - p.setContactemail(field(rs.getString("contactemail"), info)); - p.setSummary(field(rs.getString("summary"), info)); - p.setCurrency(field(rs.getString("currency"), info)); - p.setTotalcost(new Float(rs.getDouble("totalcost"))); - p.setFundedamount(new Float(rs.getDouble("fundedamount"))); - p.setDataInfo(info); - p.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(p); - - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processOrganization(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Organization o = new Organization(); - - o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); - o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); - o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"))); - o.setPid(new ArrayList<>()); - 
o.setDateofcollection(asString(rs.getDate("dateofcollection"))); - o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - o.setExtraInfo(new ArrayList<>()); // Values not present in the DB - o.setOaiprovenance(null); // Values not present in the DB - o.setLegalshortname(field(rs.getString("legalshortname"), info)); - o.setLegalname(field(rs.getString("legalname"), info)); - o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query - o.setWebsiteurl(field(rs.getString("websiteurl"), info)); - o.setLogourl(field(rs.getString("logourl"), info)); - o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); - o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); - o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); - o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); - o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); - o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); - o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); - o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); - o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); - o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); - o.setCountry(prepareQualifierSplitting(rs.getString("country"))); - o.setDataInfo(info); - o.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(o); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processDatasourceOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("organization"), true); - final String dsId 
= createOpenaireId(10, rs.getString("datasource"), true); - final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("datasourceOrganization"); - r1.setSubRelType("provision"); - r1.setRelClass("isProvidedBy"); - r1.setSource(dsId); - r1.setTarget(orgId); - r1.setCollectedFrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("datasourceOrganization"); - r2.setSubRelType("provision"); - r2.setRelClass("provides"); - r2.setSource(orgId); - r2.setTarget(dsId); - r2.setCollectedFrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProjectOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); - final String projectId = createOpenaireId(40, rs.getString("project"), true); - final List collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("projectOrganization"); - r1.setSubRelType("participation"); - r1.setRelClass("isParticipant"); - r1.setSource(projectId); - r1.setTarget(orgId); - r1.setCollectedFrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("projectOrganization"); - r2.setSubRelType("participation"); - r2.setRelClass("hasParticipant"); - r2.setSource(orgId); - r2.setTarget(projectId); - r2.setCollectedFrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new 
RuntimeException(e); - } - } - - public List processClaims(final ResultSet rs) { - - final DataInfo info = - dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9"); - - try { - - if (rs.getString("source_type").equals("context")) { - final Result r; - - if (rs.getString("target_type").equals("dataset")) { - r = new Dataset(); - } else if (rs.getString("target_type").equals("software")) { - r = new Software(); - } else if (rs.getString("target_type").equals("other")) { - r = new OtherResearchProduct(); - } else { - r = new Publication(); - } - r.setId(createOpenaireId(50, rs.getString("target_id"), false)); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setContext(prepareContext(rs.getString("source_id"), info)); - r.setDataInfo(info); - - return Arrays.asList(r); - } else { - final String sourceId = createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); - final String targetId = createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); - - final Relation r1 = new Relation(); - final Relation r2 = new Relation(); - - if (rs.getString("source_type").equals("project")) { - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("produces"); - - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("isProducedBy"); - } else { - r1.setRelType("resultResult"); - r1.setSubRelType("relationship"); - r1.setRelClass("isRelatedTo"); - - r2.setRelType("resultResult"); - r2.setSubRelType("relationship"); - r2.setRelClass("isRelatedTo"); - } - - r1.setSource(sourceId); - r1.setTarget(targetId); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - r2.setSource(targetId); - r2.setTarget(sourceId); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } - - } catch (final Exception e) { - throw new RuntimeException(e); - 
} - } - - private List prepareContext(final String id, final DataInfo dataInfo) { - final Context context = new Context(); - context.setId(id); - context.setDataInfo(Arrays.asList(dataInfo)); - return Arrays.asList(context); - } - - private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { - final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); - final String inferenceprovenance = rs.getString("inferenceprovenance"); - final Boolean inferred = rs.getBoolean("inferred"); - final String trust = rs.getString("trust"); - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); - } - - private Qualifier prepareQualifierSplitting(final String s) { - if (StringUtils.isBlank(s)) { return null; } - final String[] arr = s.split("@@@"); - return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; - } - - private List> prepareListFields(final Array array, final DataInfo info) { - try { - return array != null ? 
listFields(info, (String[]) array.getArray()) : new ArrayList<>(); - } catch (final SQLException e) { - throw new RuntimeException("Invalid SQL array", e); - } - } - - private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { - if (StringUtils.isBlank(s)) { return null; } - final String[] parts = s.split("###"); - if (parts.length == 2) { - final String value = parts[0]; - final String[] arr = parts[1].split("@@@"); - if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); } - } - return null; - } - - private List prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException { - final List res = new ArrayList<>(); - if (array != null) { - for (final String s : (String[]) array.getArray()) { - final StructuredProperty sp = prepareStructProp(s, dataInfo); - if (sp != null) { - res.add(sp); - } - } - } - - return res; - } - - private Journal prepareJournal(final String name, final String sj, final DataInfo info) { - if (StringUtils.isNotBlank(sj)) { - final String[] arr = sj.split("@@@"); - if (arr.length == 3) { - final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; - final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; - final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null;; - if (issn != null || eissn != null - || lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); } - } - } - return null; - } - - @Override - public void close() throws IOException { - super.close(); - dbClient.close(); - } - +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication + implements Closeable { + + private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = + qualifier( + "sysimport:crosswalk:entityregistry", + "sysimport:crosswalk:entityregistry", + "dnet:provenance_actions", + "dnet:provenance_actions"); + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + private final long lastUpdateTimestamp; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateDbEntitiesApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + + final boolean processClaims = + parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); + + try (final MigrateDbEntitiesApplication smdbe = + new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { + if (processClaims) { + log.info("Processing claims..."); + smdbe.execute("queryClaims.sql", smdbe::processClaims); + } else { + log.info("Processing datasources..."); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + + 
log.info("Processing projects..."); + smdbe.execute("queryProjects.sql", smdbe::processProject); + + log.info("Processing orgs..."); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + + log.info("Processing relations ds <-> orgs ..."); + smdbe.execute( + "queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + + log.info("Processing projects <-> orgs ..."); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + } + log.info("All done."); + } + } + + protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST + super(); + this.dbClient = null; + this.lastUpdateTimestamp = new Date().getTime(); + } + + public MigrateDbEntitiesApplication( + final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword) + throws Exception { + super(hdfsPath); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.lastUpdateTimestamp = new Date().getTime(); + } + + public void execute(final String sqlFile, final Function> producer) + throws Exception { + final String sql = + IOUtils.toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); + + final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); + + dbClient.processResults(sql, consumer); + } + + public List processDatasource(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Datasource ds = new Datasource(); + + ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds.setCollectedfrom( + listKeyValues( + rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + ds.setPid(new ArrayList<>()); + ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); + ds.setDateoftransformation(null); // Value not returned by the SQL query + ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB + ds.setOaiprovenance(null); // 
Values not present in the DB + ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); + ds.setOpenairecompatibility( + prepareQualifierSplitting(rs.getString("openairecompatibility"))); + ds.setOfficialname(field(rs.getString("officialname"), info)); + ds.setEnglishname(field(rs.getString("englishname"), info)); + ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); + ds.setLogourl(field(rs.getString("logourl"), info)); + ds.setContactemail(field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); + ds.setDescription(field(rs.getString("description"), info)); + ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); + ds.setOdpolicies(field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); + ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); + ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); + ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); + ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); + ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); + 
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(field(rs.getString("pidsystems"), info)); + ds.setCertificates(field(rs.getString("certificates"), info)); + ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array + ds.setJournal( + prepareJournal( + rs.getString("officialname"), + rs.getString("journal"), + info)); // Journal + ds.setDataInfo(info); + ds.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(ds); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProject(final ResultSet rs) { + try { + + final DataInfo info = prepareDataInfo(rs); + + final Project p = new Project(); + + p.setId(createOpenaireId(40, rs.getString("projectid"), true)); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p.setCollectedfrom( + listKeyValues( + rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + p.setPid(new ArrayList<>()); + p.setDateofcollection(asString(rs.getDate("dateofcollection"))); + p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + p.setExtraInfo(new ArrayList<>()); // Values not present in the DB + p.setOaiprovenance(null); // Values not present in the DB + p.setWebsiteurl(field(rs.getString("websiteurl"), info)); + p.setCode(field(rs.getString("code"), info)); + p.setAcronym(field(rs.getString("acronym"), info)); + p.setTitle(field(rs.getString("title"), info)); + p.setStartdate(field(asString(rs.getDate("startdate")), info)); + p.setEnddate(field(asString(rs.getDate("enddate")), info)); + p.setCallidentifier(field(rs.getString("callidentifier"), info)); + 
p.setKeywords(field(rs.getString("keywords"), info)); + p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications( + field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); + p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); + p.setOptional1(field(rs.getString("optional1"), info)); + p.setOptional2(field(rs.getString("optional2"), info)); + p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(field(rs.getString("contactfullname"), info)); + p.setContactfax(field(rs.getString("contactfax"), info)); + p.setContactphone(field(rs.getString("contactphone"), info)); + p.setContactemail(field(rs.getString("contactemail"), info)); + p.setSummary(field(rs.getString("summary"), info)); + p.setCurrency(field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + p.setDataInfo(info); + p.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(p); + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processOrganization(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Organization o = new Organization(); + + o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o.setCollectedfrom( + listKeyValues( + rs.getString("collectedfromid"), rs.getString("collectedfromname"))); + o.setPid(new ArrayList<>()); + o.setDateofcollection(asString(rs.getDate("dateofcollection"))); + 
o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + o.setExtraInfo(new ArrayList<>()); // Values not present in the DB + o.setOaiprovenance(null); // Values not present in the DB + o.setLegalshortname(field(rs.getString("legalshortname"), info)); + o.setLegalname(field(rs.getString("legalname"), info)); + o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query + o.setWebsiteurl(field(rs.getString("websiteurl"), info)); + o.setLogourl(field(rs.getString("logourl"), info)); + o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization( + field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation( + field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests( + field( + Boolean.toString( + rs.getBoolean("ecinternationalorganizationeurinterests")), + info)); + o.setEcinternationalorganization( + field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + o.setDataInfo(info); + o.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(o); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processDatasourceOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("organization"), true); + final String dsId = createOpenaireId(10, 
rs.getString("datasource"), true); + final List collectedFrom = + listKeyValues( + rs.getString("collectedfromid"), rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("datasourceOrganization"); + r1.setSubRelType("provision"); + r1.setRelClass("isProvidedBy"); + r1.setSource(dsId); + r1.setTarget(orgId); + r1.setCollectedFrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("datasourceOrganization"); + r2.setSubRelType("provision"); + r2.setRelClass("provides"); + r2.setSource(orgId); + r2.setTarget(dsId); + r2.setCollectedFrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProjectOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); + final String projectId = createOpenaireId(40, rs.getString("project"), true); + final List collectedFrom = + listKeyValues( + rs.getString("collectedfromid"), rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("projectOrganization"); + r1.setSubRelType("participation"); + r1.setRelClass("isParticipant"); + r1.setSource(projectId); + r1.setTarget(orgId); + r1.setCollectedFrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("projectOrganization"); + r2.setSubRelType("participation"); + r2.setRelClass("hasParticipant"); + r2.setSource(orgId); + r2.setTarget(projectId); + r2.setCollectedFrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + 
} + } + + public List processClaims(final ResultSet rs) { + + final DataInfo info = + dataInfo( + false, + null, + false, + false, + qualifier( + "user:claim", + "user:claim", + "dnet:provenanceActions", + "dnet:provenanceActions"), + "0.9"); + + try { + + if (rs.getString("source_type").equals("context")) { + final Result r; + + if (rs.getString("target_type").equals("dataset")) { + r = new Dataset(); + } else if (rs.getString("target_type").equals("software")) { + r = new Software(); + } else if (rs.getString("target_type").equals("other")) { + r = new OtherResearchProduct(); + } else { + r = new Publication(); + } + r.setId(createOpenaireId(50, rs.getString("target_id"), false)); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setContext(prepareContext(rs.getString("source_id"), info)); + r.setDataInfo(info); + + return Arrays.asList(r); + } else { + final String sourceId = + createOpenaireId( + rs.getString("source_type"), rs.getString("source_id"), false); + final String targetId = + createOpenaireId( + rs.getString("target_type"), rs.getString("target_id"), false); + + final Relation r1 = new Relation(); + final Relation r2 = new Relation(); + + if (rs.getString("source_type").equals("project")) { + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("produces"); + + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("isProducedBy"); + } else { + r1.setRelType("resultResult"); + r1.setSubRelType("relationship"); + r1.setRelClass("isRelatedTo"); + + r2.setRelType("resultResult"); + r2.setSubRelType("relationship"); + r2.setRelClass("isRelatedTo"); + } + + r1.setSource(sourceId); + r1.setTarget(targetId); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + r2.setSource(targetId); + r2.setTarget(sourceId); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } + + } catch (final Exception e) { + throw new 
RuntimeException(e); + } + } + + private List prepareContext(final String id, final DataInfo dataInfo) { + final Context context = new Context(); + context.setId(id); + context.setDataInfo(Arrays.asList(dataInfo)); + return Arrays.asList(context); + } + + private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { + final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); + final String inferenceprovenance = rs.getString("inferenceprovenance"); + final Boolean inferred = rs.getBoolean("inferred"); + final String trust = rs.getString("trust"); + return dataInfo( + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); + } + + private Qualifier prepareQualifierSplitting(final String s) { + if (StringUtils.isBlank(s)) { + return null; + } + final String[] arr = s.split("@@@"); + return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; + } + + private List> prepareListFields(final Array array, final DataInfo info) { + try { + return array != null + ? 
listFields(info, (String[]) array.getArray()) + : new ArrayList<>(); + } catch (final SQLException e) { + throw new RuntimeException("Invalid SQL array", e); + } + } + + private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { + if (StringUtils.isBlank(s)) { + return null; + } + final String[] parts = s.split("###"); + if (parts.length == 2) { + final String value = parts[0]; + final String[] arr = parts[1].split("@@@"); + if (arr.length == 4) { + return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); + } + } + return null; + } + + private List prepareListOfStructProps( + final Array array, final DataInfo dataInfo) throws SQLException { + final List res = new ArrayList<>(); + if (array != null) { + for (final String s : (String[]) array.getArray()) { + final StructuredProperty sp = prepareStructProp(s, dataInfo); + if (sp != null) { + res.add(sp); + } + } + } + + return res; + } + + private Journal prepareJournal(final String name, final String sj, final DataInfo info) { + if (StringUtils.isNotBlank(sj)) { + final String[] arr = sj.split("@@@"); + if (arr.length == 3) { + final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; + final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null; + ; + final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null; + ; + if (issn != null || eissn != null || lissn != null) { + return journal( + name, issn, eissn, lissn, null, null, null, null, null, null, null, + info); + } + } + } + return null; + } + + @Override + public void close() throws IOException { + super.close(); + dbClient.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index 585209ac9..90966429a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -3,64 +3,74 @@ package eu.dnetlib.dhp.oa.graph.raw; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import java.io.Closeable; import java.io.IOException; import java.util.Map; import java.util.Map.Entry; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; -public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable { +public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication + implements Closeable { - private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); - private final MdstoreClient mdstoreClient; + private final MdstoreClient mdstoreClient; - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser
= new ArgumentApplicationParser( - IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateMongoMdstoresApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); + parser.parseArgument(args); - final String mongoBaseUrl = parser.get("mongoBaseUrl"); - final String mongoDb = parser.get("mongoDb"); + final String mongoBaseUrl = parser.get("mongoBaseUrl"); + final String mongoDb = parser.get("mongoDb"); - final String mdFormat = parser.get("mdFormat"); - final String mdLayout = parser.get("mdLayout"); - final String mdInterpretation = parser.get("mdInterpretation"); + final String mdFormat = parser.get("mdFormat"); + final String mdLayout = parser.get("mdLayout"); + final String mdInterpretation = parser.get("mdInterpretation"); - final String hdfsPath = parser.get("hdfsPath"); + final String hdfsPath = parser.get("hdfsPath"); - try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) { - app.execute(mdFormat, mdLayout, mdInterpretation); - } + try (MigrateMongoMdstoresApplication app = + new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) { + app.execute(mdFormat, mdLayout, mdInterpretation); + } + } - } + public MigrateMongoMdstoresApplication( + final String hdfsPath, final String mongoBaseUrl, final String mongoDb) + throws Exception { + super(hdfsPath); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + } - public MigrateMongoMdstoresApplication(final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception { - super(hdfsPath); - this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); - } + public void 
execute(final String format, final String layout, final String interpretation) { + final Map colls = + mdstoreClient.validCollections(format, layout, interpretation); + log.info("Found " + colls.size() + " mdstores"); - public void execute(final String format, final String layout, final String interpretation) { - final Map colls = mdstoreClient.validCollections(format, layout, interpretation); - log.info("Found " + colls.size() + " mdstores"); + for (final Entry entry : colls.entrySet()) { + log.info( + "Processing mdstore " + + entry.getKey() + + " (collection: " + + entry.getValue() + + ")"); + final String currentColl = entry.getValue(); - for (final Entry entry : colls.entrySet()) { - log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); - final String currentColl = entry.getValue(); - - for (final String xml : mdstoreClient.listRecords(currentColl)) { - emit(xml, "native_" + format); - } - } - } - - @Override - public void close() throws IOException { - super.close(); - mdstoreClient.close(); - } + for (final String xml : mdstoreClient.listRecords(currentColl)) { + emit(xml, "native_" + format); + } + } + } + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index aed582d8f..e77b1f87f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,232 +1,258 @@ package eu.dnetlib.dhp.oa.graph.raw; -import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; -import eu.dnetlib.dhp.schema.oaf.*; -import org.dom4j.Document; -import org.dom4j.Node; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import 
java.util.Map; - import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.schema.oaf.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import org.dom4j.Document; +import org.dom4j.Node; + public class OafToOafMapper extends AbstractMdRecordToOafMapper { - public OafToOafMapper(final Map code2name) { - super(code2name); - } + public OafToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.getText()); - author.setRank(pos++); - final PacePerson p = new PacePerson(n.getText(), false); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.getText()); + author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } + res.add(author); + } + return res; + } - @Override - protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", 
"dnet:languages", "dnet:languages"); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:description", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributor", info); - } + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributor", info); + } - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - return 
prepareListFields(doc, "//dc:coverage", info); - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:coverage", info); + } - @Override - protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:identifier")) { - final String url = ((Node) o).getText().trim(); - if (url.startsWith("http")) { - final Instance instance = new Instance(); - instance.setUrl(Arrays.asList(url)); - instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - res.add(instance); - } - } - return res; - } + @Override + protected List prepareInstances( + final Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:identifier")) { + final String url = ((Node) o).getText().trim(); + if (url.startsWith("http")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(url)); + instance.setInstancetype( + prepareQualifier( + doc, + "//dr:CobjCategory", + 
"dnet:publication_resource", + "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright( + prepareQualifier( + doc, + "//oaf:accessrights", + "dnet:access_modes", + "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount( + field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance.setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + } + return res; + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:source", info); - } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:source", info); + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareRelevantDates( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // SOFTWARES + // SOFTWARES - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { 
+ return null; // NOT PRESENT IN OAF + } - @Override - protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + // DATASETS + @Override + protected List prepareDatasetGeoLocations( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return null; // NOT PRESENT 
IN OAF - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - // OTHER PRODUCTS + // OTHER PRODUCTS - @Override - protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List addOtherResultRels(final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + @Override + 
protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { - final String otherId = createOpenaireId(50, ((Node) o).getText(), false); + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); - final Relation r1 = new Relation(); - r1.setRelType("resultResult"); - r1.setSubRelType("publicationDataset"); - r1.setRelClass("isRelatedTo"); - r1.setSource(docId); - r1.setTarget(otherId); - r1.setCollectedFrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); - final Relation r2 = new Relation(); - r2.setRelType("resultResult"); - r2.setSubRelType("publicationDataset"); - r2.setRelClass("isRelatedTo"); - r2.setSource(otherId); - r2.setTarget(docId); - r2.setCollectedFrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - return res; - } - - @Override - protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + 
r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + return res; + } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 6a6def977..5e6462417 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -1,253 +1,351 @@ package eu.dnetlib.dhp.oa.graph.raw; -import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.Node; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; +import eu.dnetlib.dhp.schema.oaf.*; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; - -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { - public OdfToOafMapper(final Map code2name) { - super(code2name); - } + public OdfToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - 
for (final Object o : doc.selectNodes("//datacite:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); - author.setRank(pos++); - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//datacite:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./datacite:creatorName")); + author.setName(n.valueOf("./datacite:givenName")); + author.setSurname(n.valueOf("./datacite:familyName")); + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); + author.setPid(preparePids(doc, info)); + author.setRank(pos++); + res.add(author); + } + return res; + } - private List preparePids(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { - res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info)); - } - return res; - } + private List preparePids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + res.add( + structuredProperty( + ((Node) o).getText(), + prepareQualifier( + (Node) o, + "./@nameIdentifierScheme", + "dnet:pid_types", + "dnet:pid_types"), + info)); + } + return res; + } - @Override - protected List prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { - final List res = 
new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { - final Instance instance = new Instance(); - instance.setUrl(Arrays.asList(((Node) o).getText().trim())); - instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - res.add(instance); - } - return res; - } + @Override + protected List prepareInstances( + final Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : + doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(((Node) o).getText().trim())); + instance.setInstancetype( + prepareQualifier( + doc, + "//dr:CobjCategory", + "dnet:publication_resource", + "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance.setAccessright( + prepareQualifier( + doc, "//oaf:accessrights", "dnet:access_modes", 
"dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount( + field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance.setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + return res; + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:date")) { - final String dateType = ((Node) o).valueOf("@dateType"); - if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") - && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) { - res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info)); - } - } - return res; - } + @Override + protected List prepareRelevantDates( + final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//datacite:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) + && !dateType.equalsIgnoreCase("Accepted") + && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") + && !dateType.equalsIgnoreCase("Available")) { + res.add( + structuredProperty( + ((Node) o).getText(), + "UNKNOWN", + "UNKNOWN", + "dnet:dataCite_date", + "dnet:dataCite_date", + info)); + } + } + return res; + } - @Override - protected List> prepareCoverages(final 
Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributorName", info); - } + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributorName", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:format", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:publisher", info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:subject", info); + } - @Override - protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, 
"//datacite:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); + } - @Override - protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", + info); + } - @Override - protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); - } + @Override + protected List> prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", + info); + } - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage( + final Document doc, final DataInfo info) { + return prepareQualifier( + doc, + "//datacite:format", + "dnet:programming_languages", + 
"dnet:programming_languages"); + } - @Override - protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", + info); + } - // DATASETS + // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); + @Override + protected List prepareDatasetGeoLocations( + final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:geoLocation")) { - final GeoLocation loc = new GeoLocation(); - loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); - loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); - loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); - res.add(loc); - } - return res; - } + for (final Object o : doc.selectNodes("//datacite:geoLocation")) { + final GeoLocation loc = new GeoLocation(); + loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); + 
loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); + res.add(loc); + } + return res; + } - @Override - protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Updated']", info); - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Updated']", info); + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:version", info); - } + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:version", info); + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:size", info); - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:size", info); + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? 
+ } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Issued']", info); - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Issued']", info); + } - @Override - protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + @Override + protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { - final String otherId = createOpenaireId(50, ((Node) o).getText(), false); - final String type = ((Node) o).valueOf("@relationType"); + for (final Object o : + doc.selectNodes( + "//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText(), false); + final String type = ((Node) o).valueOf("@relationType"); - if (type.equals("IsSupplementTo")) { - res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo")); - res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy")); - } else if (type.equals("IsPartOf")) { - res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); - res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); - } else {} - } - return 
res; - } + if (type.equals("IsSupplementTo")) { + res.add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + docId, + otherId, + "supplement", + "isSupplementTo")); + res.add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + otherId, + docId, + "supplement", + "isSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res.add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + docId, + otherId, + "part", + "IsPartOf")); + res.add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + otherId, + docId, + "part", + "HasParts")); + } else { + } + } + return res; + } - private Relation prepareOtherResultRel(final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp, - final String source, - final String target, - final String subRelType, - final String relClass) { - final Relation r = new Relation(); - r.setRelType("resultResult"); - r.setSubRelType(subRelType); - r.setRelClass(relClass); - r.setSource(source); - r.setTarget(target); - r.setCollectedFrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - return r; - } - - @Override - protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource"); - } + private Relation prepareOtherResultRel( + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + 
return r; + } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier( + doc, + "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", + "dnet:dataCite_resource", + "dnet:dataCite_resource"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index aec1ea50d..c22f7afd0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -1,6 +1,9 @@ package eu.dnetlib.dhp.oa.graph.raw.common; import eu.dnetlib.dhp.schema.oaf.Oaf; +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -9,72 +12,73 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.codehaus.jackson.map.ObjectMapper; -import java.io.Closeable; -import java.io.IOException; -import java.util.concurrent.atomic.AtomicInteger; - public class AbstractMigrationApplication implements Closeable { - private final AtomicInteger counter = new AtomicInteger(0); + private final AtomicInteger counter = new AtomicInteger(0); - private final Text key = new Text(); + private final Text key = new Text(); - private final Text value = new Text(); + private final Text value = new Text(); - private final SequenceFile.Writer writer; + private final SequenceFile.Writer writer; - private final ObjectMapper objectMapper = new ObjectMapper(); + private final ObjectMapper objectMapper = new ObjectMapper(); - private static final Log log = 
LogFactory.getLog(AbstractMigrationApplication.class); + private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); - protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST - this.writer = null; - } + protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST + this.writer = null; + } - public AbstractMigrationApplication(final String hdfsPath) throws Exception { + public AbstractMigrationApplication(final String hdfsPath) throws Exception { - log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); - this.writer = SequenceFile.createWriter(getConf(), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer - .keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class)); - } + this.writer = + SequenceFile.createWriter( + getConf(), + SequenceFile.Writer.file(new Path(hdfsPath)), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); + } - private Configuration getConf() throws IOException { - final Configuration conf = new Configuration(); - /* - * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - * conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); - * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); - */ - return conf; - } + private Configuration getConf() throws IOException { + final Configuration conf = new Configuration(); + /* + * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + * conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); + * System.setProperty("hadoop.home.dir", "/"); 
FileSystem.get(URI.create(hdfsNameNode), conf); + */ + return conf; + } - protected void emit(final String s, final String type) { - try { - key.set(counter.getAndIncrement() + ":" + type); - value.set(s); - writer.append(key, value); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emit(final String s, final String type) { + try { + key.set(counter.getAndIncrement() + ":" + type); + value.set(s); + writer.append(key, value); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - protected void emitOaf(final Oaf oaf) { - try { - emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emitOaf(final Oaf oaf) { + try { + emit( + objectMapper.writeValueAsString(oaf), + oaf.getClass().getSimpleName().toLowerCase()); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - public ObjectMapper getObjectMapper() { - return objectMapper; - } - - @Override - public void close() throws IOException { - writer.hflush(); - writer.close(); - } + public ObjectMapper getObjectMapper() { + return objectMapper; + } + @Override + public void close() throws IOException { + writer.hflush(); + writer.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java index 9c0562946..436015d2d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java @@ -1,61 +1,61 @@ package eu.dnetlib.dhp.oa.graph.raw.common; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import java.io.Closeable; import 
java.io.IOException; import java.sql.*; import java.util.function.Consumer; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; public class DbClient implements Closeable { - private static final Log log = LogFactory.getLog(DbClient.class); + private static final Log log = LogFactory.getLog(DbClient.class); - private Connection connection; + private Connection connection; - public DbClient(final String address, final String login, final String password) { + public DbClient(final String address, final String login, final String password) { - try { - Class.forName("org.postgresql.Driver"); + try { + Class.forName("org.postgresql.Driver"); - this.connection = - StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address); - this.connection.setAutoCommit(false); - } catch (final Exception e) { - log.error("Connection to postgresDB failed"); - throw new RuntimeException("Connection to postgresDB failed", e); - } - log.info("Opened database successfully"); - } + this.connection = + StringUtils.isNoneBlank(login, password) + ? 
DriverManager.getConnection(address, login, password) + : DriverManager.getConnection(address); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error("Connection to postgresDB failed"); + throw new RuntimeException("Connection to postgresDB failed", e); + } + log.info("Opened database successfully"); + } - public void processResults(final String sql, final Consumer consumer) { + public void processResults(final String sql, final Consumer consumer) { - try (final Statement stmt = connection.createStatement()) { - stmt.setFetchSize(100); + try (final Statement stmt = connection.createStatement()) { + stmt.setFetchSize(100); - try (final ResultSet rs = stmt.executeQuery(sql)) { - while (rs.next()) { - consumer.accept(rs); - } - } catch (final SQLException e) { - log.error("Error executing sql query: " + sql, e); - throw new RuntimeException("Error executing sql query", e); - } - } catch (final SQLException e1) { - log.error("Error preparing sql statement", e1); - throw new RuntimeException("Error preparing sql statement", e1); - } - } - - @Override - public void close() throws IOException { - try { - connection.close(); - } catch (final SQLException e) { - throw new RuntimeException(e); - } - } + try (final ResultSet rs = stmt.executeQuery(sql)) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + log.error("Error executing sql query: " + sql, e); + throw new RuntimeException("Error executing sql query", e); + } + } catch (final SQLException e1) { + log.error("Error preparing sql statement", e1); + throw new RuntimeException("Error preparing sql statement", e1); + } + } + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java index ac700ef63..321daf1e7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java @@ -5,89 +5,98 @@ import com.mongodb.MongoClient; import com.mongodb.MongoClientURI; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.bson.Document; - import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.stream.StreamSupport; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.bson.Document; public class MdstoreClient implements Closeable { - private final MongoClient client; - private final MongoDatabase db; + private final MongoClient client; + private final MongoDatabase db; - private static final String COLL_METADATA = "metadata"; - private static final String COLL_METADATA_MANAGER = "metadataManager"; + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; - private static final Log log = LogFactory.getLog(MdstoreClient.class); + private static final Log log = LogFactory.getLog(MdstoreClient.class); - public MdstoreClient(final String baseUrl, final String dbName) { - this.client = new MongoClient(new MongoClientURI(baseUrl)); - this.db = getDb(client, dbName); - } + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } - public Map validCollections(final String mdFormat, final 
String mdLayout, final String mdInterpretation) { + public Map validCollections( + final String mdFormat, final String mdLayout, final String mdInterpretation) { - final Map transactions = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { - final String mdId = entry.getString("mdId"); - final String currentId = entry.getString("currentId"); - if (StringUtils.isNoneBlank(mdId, currentId)) { - transactions.put(mdId, currentId); - } - } + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } - final Map res = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA, true).find()) { - if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout) - && entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) { - res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); - } - } + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { + if (entry.getString("format").equals(mdFormat) + && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) + && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } - return res; - } + return res; + } - private MongoDatabase getDb(final MongoClient client, final String dbName) { - if (!Iterables.contains(client.listDatabaseNames(), dbName)) { - final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); - log.warn(err); - throw new RuntimeException(err); - } - return 
client.getDatabase(dbName); - } + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = + String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } - private MongoCollection getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) { - if (!Iterables.contains(db.listCollectionNames(), collName)) { - final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName())); - log.warn(err); - if (abortIfMissing) { - throw new RuntimeException(err); - } else { - return null; - } - } - return db.getCollection(collName); - } + private MongoCollection getColl( + final MongoDatabase db, final String collName, final boolean abortIfMissing) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = + String.format( + String.format( + "Missing collection '%s' in database '%s'", + collName, db.getName())); + log.warn(err); + if (abortIfMissing) { + throw new RuntimeException(err); + } else { + return null; + } + } + return db.getCollection(collName); + } - public Iterable listRecords(final String collName) { - final MongoCollection coll = getColl(db, collName, false); - return coll == null ? new ArrayList<>() - : () -> StreamSupport.stream(coll.find().spliterator(), false) - .filter(e -> e.containsKey("body")) - .map(e -> e.getString("body")) - .iterator(); - } - - @Override - public void close() throws IOException { - client.close(); - } + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null + ? 
new ArrayList<>() + : () -> + StreamSupport.stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } + @Override + public void close() throws IOException { + client.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index d02070a8b..0f844cda0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -2,185 +2,215 @@ package eu.dnetlib.dhp.oa.graph.raw.common; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.lang3.StringUtils; - import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; public class OafMapperUtils { - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } - public static List listKeyValues(final String... s) { - if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); } + public static List listKeyValues(final String... 
s) { + if (s.length % 2 > 0) { + throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); + } - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { return null; } + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } - public static List> listFields(final DataInfo info, final String... values) { - return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final String... 
values) { + return Arrays.stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static List> listFields(final DataInfo info, final List values) { - return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final List values) { + return values.stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } - public static StructuredProperty structuredProperty(final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); - } + return structuredProperty( + value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } - public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) { - if (value == null) { return null; } - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(value); - 
sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); - return sp; - } + public static StructuredProperty structuredProperty( + final String value, final Qualifier qualifier, final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } - public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } - public static OAIProvenance oaiIProvenance(final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { - final OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + 
desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); - return p; - } + return p; + } - public static Journal journal(final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { - if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); - j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } else { - return null; - } - } + if (StringUtils.isNotBlank(name) + || StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + 
return null; + } + } - public static DataInfo dataInfo(final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } - public static String createOpenaireId(final int prefix, final String originalId, final boolean to_md5) { - if (to_md5) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } else { - return String.format("%s|%s", prefix, originalId); - } - } + public static String createOpenaireId( + final int prefix, final String originalId, final boolean to_md5) { + if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } - public static String createOpenaireId(final String type, final String originalId, final boolean to_md5) { - switch (type) { - case "datasource": - return createOpenaireId(10, 
originalId, to_md5); - case "organization": - return createOpenaireId(20, originalId, to_md5); - case "person": - return createOpenaireId(30, originalId, to_md5); - case "project": - return createOpenaireId(40, originalId, to_md5); - default: - return createOpenaireId(50, originalId, to_md5); - } - } - - public static String asString(final Object o) { - return o == null ? "" : o.toString(); - } + public static String createOpenaireId( + final String type, final String originalId, final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } + public static String asString(final Object o) { + return o == null ? "" : o.toString(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index a72788728..8d29835d1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -5,171 +5,175 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.text.WordUtils; - import java.nio.charset.Charset; import java.text.Normalizer; import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.text.WordUtils; public class PacePerson { - private static final String UTF8 = "UTF-8"; - private 
List name = Lists.newArrayList(); - private List surname = Lists.newArrayList(); - private List fullname = Lists.newArrayList(); - private final String original; + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; - private static Set particles = null; + private static Set particles = null; - public static final String capitalize(final String s) { - return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); - } + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } - public static final String dotAbbreviations(final String s) { - return s.length() == 1 ? s + "." : s; - } + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." : s; + } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } + public static Set loadFromClasspath(final String classpath) { + final Set h = new HashSet<>(); + try { + for (final String s : + IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } - public PacePerson(String s, final boolean aggressive) { - original = s; - s = Normalizer.normalize(s, Normalizer.Form.NFD); - s = s.replaceAll("\\(.+\\)", ""); - s = s.replaceAll("\\[.+\\]", ""); - s = s.replaceAll("\\{.+\\}", ""); - s = s.replaceAll("\\s+-\\s+", "-"); - s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); - s = s.replaceAll("\\d", " "); - s = s.replaceAll("\\n", " "); - s = s.replaceAll("\\.", " "); - s = s.replaceAll("\\s+", " "); + public PacePerson(String s, final boolean 
aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); - if (aggressive) { - s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); - // s = s.replaceAll("[\\W&&[^,-]]", ""); - } + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } - if (s.contains(",")) { - final String[] arr = s.split(","); - if (arr.length == 1) { - fullname = splitTerms(arr[0]); - } else if (arr.length > 1) { - surname = splitTerms(arr[0]); - name = splitTerms(arr[1]); - fullname.addAll(surname); - fullname.addAll(name); - } - } else { - fullname = splitTerms(s); + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); - int lastInitialPosition = fullname.size(); - boolean hasSurnameInUpperCase = false; + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; - for (int i = 0; i < fullname.size(); i++) { - final String term = fullname.get(i); - if (term.length() == 1) { - lastInitialPosition = i; - } else if (term.equals(term.toUpperCase())) { - hasSurnameInUpperCase = true; - } - } + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } - if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. 
Artini - name = fullname.subList(0, lastInitialPosition + 1); - surname = fullname.subList(lastInitialPosition + 1, fullname.size()); - } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI - for (final String term : fullname) { - if (term.length() > 1 && term.equals(term.toUpperCase())) { - surname.add(term); - } else { - name.add(term); - } - } - } - } - } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } - private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } + private List splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); + } - final List list = Lists.newArrayList(); - for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { - if (!particles.contains(part.toLowerCase())) { - list.add(part); - } - } - return list; - } + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } - public List getName() { - return name; - } + public List getName() { + return name; + } - public String getNameString() { - return Joiner.on(" ").join(getName()); - } + public String getNameString() { + return Joiner.on(" ").join(getName()); + } - public List getSurname() { - return surname; - } + public List getSurname() { + return surname; + } - public List getFullname() { - return fullname; - } + public List getFullname() { + return fullname; 
+ } - public String getOriginal() { - return original; - } + public String getOriginal() { + return original; + } - public String hash() { - return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString(); - } + public String hash() { + return Hashing.murmur3_128() + .hashString(getNormalisedFullname(), Charset.forName(UTF8)) + .toString(); + } - public String getNormalisedFirstName() { - return Joiner.on(" ").join(getCapitalFirstnames()); - } + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } - public String getNormalisedSurname() { - return Joiner.on(" ").join(getCapitalSurname()); - } + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } - public String getSurnameString() { - return Joiner.on(" ").join(getSurname()); - } + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } - public String getNormalisedFullname() { - return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); - } + public String getNormalisedFullname() { + return isAccurate() + ? 
getNormalisedSurname() + ", " + getNormalisedFirstName() + : Joiner.on(" ").join(fullname); + } - public List getCapitalFirstnames() { - return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); - } + public List getCapitalFirstnames() { + return Lists.newArrayList( + Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } - public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); - } + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } - public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); - } - - public boolean isAccurate() { - return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); - } + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java index c313c139e..a5803134e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java @@ -7,6 +7,14 @@ import com.mongodb.client.FindIterable; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicInteger; +import 
java.util.function.Consumer; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -16,61 +24,38 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.bson.Document; import org.bson.conversions.Bson; -import java.io.IOException; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Consumer; -import java.util.stream.Collectors; /** - * This job is responsible to collect - * data from mongoDatabase and store in a sequence File on HDFS - * Mongo database contains information of each MDSTore in two collections: - * -metadata - * That contains info like: - * ID, format, layout, interpretation - * -metadataManager: - * that contains info : - * ID, mongoCollectionName - * from the metadata collection we filter the ids with Format, layout, and Interpretation - * from the metadataManager we get the current MONGO collection name which contains metadata XML - * see function getCurrentId - * - * This Job will be called different times in base at the triple we want import, - * and generates for each triple a sequence file of XML + * This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS + * Mongo database contains information of each MDSTore in two collections: -metadata That contains + * info like: ID, format, layout, interpretation -metadataManager: that contains info : ID, + * mongoCollectionName from the metadata collection we filter the ids with Format, layout, and + * Interpretation from the metadataManager we get the current MONGO collection name which contains + * metadata XML see function getCurrentId * + *

This Job will be called different times in base at the triple we want import, and generates + * for each triple a sequence file of XML */ - public class ImportDataFromMongo { /** - * It requires in input some parameters described on a file eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json + * It requires in input some parameters described on a file + * eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json * - * - the name node - * - the paht where store HDFS File - * - the mongo host - * - the mongo port - * - the metadata format to import - * - the metadata layout to import - * - the metadata interpretation to import - * - the mongo database Name - * - * This params are encoded into args - * - * - + *

- the name node - the paht where store HDFS File - the mongo host - the mongo port - the + * metadata format to import - the metadata layout to import - the metadata interpretation to + * import - the mongo database Name * + *

This params are encoded into args * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - ImportDataFromMongo.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + ImportDataFromMongo.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); parser.parseArgument(args); final int port = Integer.parseInt(parser.get("dbport")); final String host = parser.get("dbhost"); @@ -85,11 +70,22 @@ public class ImportDataFromMongo { MongoCollection metadata = database.getCollection("metadata"); MongoCollection metadataManager = database.getCollection("metadataManager"); - final DBObject query = QueryBuilder.start("format").is(format).and("layout").is(layout).and("interpretation").is(interpretation).get(); + final DBObject query = + QueryBuilder.start("format") + .is(format) + .and("layout") + .is(layout) + .and("interpretation") + .is(interpretation) + .get(); final List ids = new ArrayList<>(); - metadata.find((Bson) query).forEach((Consumer) document -> ids.add(document.getString("mdId"))); - List databaseId = ids.stream().map(it -> getCurrentId(it, metadataManager)).filter(Objects::nonNull).collect(Collectors.toList()); - + metadata.find((Bson) query) + .forEach((Consumer) document -> ids.add(document.getString("mdId"))); + List databaseId = + ids.stream() + .map(it -> getCurrentId(it, metadataManager)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); final String hdfsuri = parser.get("namenode"); // ====== Init HDFS File System Object @@ -104,43 +100,53 @@ public class ImportDataFromMongo { Path hdfswritepath = new Path(parser.get("targetPath")); final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer 
writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { final IntWritable key = new IntWritable(counter.get()); final Text value = new Text(); - databaseId.forEach(id -> { - System.out.println("Reading :"+id); - MongoCollection collection = database.getCollection(id); - collection.find().forEach((Consumer) document -> - { - key.set(counter.getAndIncrement()); - value.set(document.getString("body")); + databaseId.forEach( + id -> { + System.out.println("Reading :" + id); + MongoCollection collection = database.getCollection(id); + collection + .find() + .forEach( + (Consumer) + document -> { + key.set(counter.getAndIncrement()); + value.set(document.getString("body")); - if (counter.get() % 10000 == 0) { - System.out.println("Added "+counter.get()); - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - ); - }); + if (counter.get() % 10000 == 0) { + System.out.println( + "Added " + counter.get()); + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + }); } } /** * Return the name of mongo collection giving an MdStore ID + * * @param mdId The id of the MDStore - * @param metadataManager The collection metadataManager on mongo which contains this information + * @param metadataManager The collection metadataManager on mongo which contains this + * information * @return */ - private static String getCurrentId(final String mdId, final MongoCollection metadataManager) { - FindIterable result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); + private static String getCurrentId( + final String 
mdId, final MongoCollection metadataManager) { + FindIterable result = + metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); final Document item = result.first(); return item == null ? null : item.getString("currentId"); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java index f2a1aa4d7..41bc86c52 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java @@ -2,92 +2,87 @@ package eu.dnetlib.dhp.sx.graph; import com.jayway.jsonpath.JsonPath; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import net.minidev.json.JSONArray; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; -import net.minidev.json.JSONArray; - -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - /** - * This Job extracts a typology of entity and stores it in a new RDD - * This job is called different times, for each file generated by the Job {@link ImportDataFromMongo} - * and store the new RDD in a path that should be under a folder: - * extractedEntities/entity/version1 + * This Job extracts a typology of entity and stores it in a new RDD This job is called different + * times, for each file generated by the Job {@link ImportDataFromMongo} and store the new RDD in a + * path that should be under a folder: extractedEntities/entity/version1 * - * at the end of this process we will have : - * 
extractedEntities/dataset/version1 - * extractedEntities/dataset/version2 - * extractedEntities/dataset/... - * extractedEntities/publication/version1 - * extractedEntities/publication/version2 - * extractedEntities/publication/... - * extractedEntities/unknown/version1 - * extractedEntities/unknown/version2 - * extractedEntities/unknown/... - * extractedEntities/relation/version1 - * extractedEntities/relation/version2 - * extractedEntities/relation/... + *

at the end of this process we will have : extractedEntities/dataset/version1 + * extractedEntities/dataset/version2 extractedEntities/dataset/... + * extractedEntities/publication/version1 extractedEntities/publication/version2 + * extractedEntities/publication/... extractedEntities/unknown/version1 + * extractedEntities/unknown/version2 extractedEntities/unknown/... + * extractedEntities/relation/version1 extractedEntities/relation/version2 + * extractedEntities/relation/... */ - public class SparkExtractEntitiesJob { - final static String IDJSONPATH = "$.id"; - final static String SOURCEJSONPATH = "$.source"; - final static String TARGETJSONPATH = "$.target"; - + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkExtractEntitiesJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkExtractEntitiesJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkExtractEntitiesJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkExtractEntitiesJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); final String targetPath = parser.get("targetPath"); - final String tdir =parser.get("targetDir"); + final String tdir = parser.get("targetDir"); final JavaRDD inputRDD = 
sc.textFile(inputPath); - List entities = Arrays.stream(parser.get("entities").split(",")).map(String::trim).collect(Collectors.toList()); + List entities = + Arrays.stream(parser.get("entities").split(",")) + .map(String::trim) + .collect(Collectors.toList()); if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) { - //Extract Dataset - inputRDD.filter(SparkExtractEntitiesJob::isDataset).saveAsTextFile(targetPath + "/dataset/"+tdir, GzipCodec.class); + // Extract Dataset + inputRDD.filter(SparkExtractEntitiesJob::isDataset) + .saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class); } if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) { - //Extract Unknown - inputRDD.filter(SparkExtractEntitiesJob::isUnknown).saveAsTextFile(targetPath + "/unknown/"+tdir, GzipCodec.class); + // Extract Unknown + inputRDD.filter(SparkExtractEntitiesJob::isUnknown) + .saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class); } if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) { - //Extract Relation - inputRDD.filter(SparkExtractEntitiesJob::isRelation).saveAsTextFile(targetPath + "/relation/"+tdir, GzipCodec.class); + // Extract Relation + inputRDD.filter(SparkExtractEntitiesJob::isRelation) + .saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class); } if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) { - //Extract Relation - inputRDD.filter(SparkExtractEntitiesJob::isPublication).saveAsTextFile(targetPath + "/publication/"+tdir, GzipCodec.class); + // Extract Relation + inputRDD.filter(SparkExtractEntitiesJob::isPublication) + .saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class); } } - public static boolean isDataset(final String json) { final String id = getJPathString(IDJSONPATH, json); if (StringUtils.isBlank(id)) return false; return id.startsWith("60|"); } - public static boolean isPublication(final String json) { final String id = getJPathString(IDJSONPATH, json); if 
(StringUtils.isBlank(id)) return false; @@ -106,12 +101,10 @@ public class SparkExtractEntitiesJob { return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target); } - public static String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; + if (o instanceof String) return (String) o; if (o instanceof JSONArray && ((JSONArray) o).size() > 0) return (String) ((JSONArray) o).get(0); return ""; @@ -119,6 +112,4 @@ public class SparkExtractEntitiesJob { return ""; } } - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java index aa2f2cc58..fa446b3d1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java @@ -1,6 +1,5 @@ package eu.dnetlib.dhp.sx.graph; -import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang3.StringUtils; @@ -13,52 +12,69 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import scala.Tuple2; - /** - * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is different from the identifier - * * associated by the aggregator, this means that some relation points to missing identifier - * To avoid this problem we store in the model the Id and the OriginalObJIdentifier - * This jobs extract this pair and creates a Similar relation that will be used in SparkMergeEntities - * + * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is + * different from the identifier * associated by the aggregator, this 
means that some relation + * points to missing identifier To avoid this problem we store in the model the Id and the + * OriginalObJIdentifier This jobs extract this pair and creates a Similar relation that will be + * used in SparkMergeEntities */ - public class SparkSXGeneratePidSimlarity { - final static String IDJSONPATH = "$.id"; - final static String OBJIDPATH = "$.originalObjIdentifier"; + static final String IDJSONPATH = "$.id"; + static final String OBJIDPATH = "$.originalObjIdentifier"; + public static void generateDataFrame( + final SparkSession spark, + final JavaSparkContext sc, + final String inputPath, + final String targetPath) { + final JavaPairRDD datasetSimRel = + sc.textFile(inputPath + "/dataset/*") + .mapToPair( + (PairFunction) + k -> + new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> + !StringUtils.substringAfter(t._1(), "|") + .equalsIgnoreCase( + StringUtils.substringAfter(t._2(), "::"))) + .distinct(); + final JavaPairRDD publicationSimRel = + sc.textFile(inputPath + "/publication/*") + .mapToPair( + (PairFunction) + k -> + new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> + !StringUtils.substringAfter(t._1(), "|") + .equalsIgnoreCase( + StringUtils.substringAfter(t._2(), "::"))) + .distinct(); - - public static void generateDataFrame(final SparkSession spark, final JavaSparkContext sc, final String inputPath, final String targetPath) { - - - final JavaPairRDD datasetSimRel = sc.textFile(inputPath+"/dataset/*") - .mapToPair((PairFunction) k -> - new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k))) - .filter(t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); - - final JavaPairRDD publicationSimRel = sc.textFile(inputPath+"/publication/*") - .mapToPair((PairFunction) k -> - new 
Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k))) - .filter(t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); - - JavaRDD simRel = datasetSimRel.union(publicationSimRel).map(s -> { - final DLIRelation r = new DLIRelation(); - r.setSource(s._1()); - r.setTarget(s._2()); - r.setRelType("similar"); - return r; - } - ); - spark.createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)).distinct().write() - .mode(SaveMode.Overwrite).save(targetPath+"/pid_simRel"); + JavaRDD simRel = + datasetSimRel + .union(publicationSimRel) + .map( + s -> { + final DLIRelation r = new DLIRelation(); + r.setSource(s._1()); + r.setTarget(s._2()); + r.setRelType("similar"); + return r; + }); + spark.createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .save(targetPath + "/pid_simRel"); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java index 36c94f595..b528f473b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java @@ -10,6 +10,10 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.utils.DHPUtils; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; import net.minidev.json.JSONArray; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -29,59 +33,54 @@ import org.apache.spark.sql.SaveMode; import 
org.apache.spark.sql.SparkSession; import scala.Tuple2; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - - /** - * This job is responsible of the creation of RAW Graph - * It is applied to the different entities generated from {@link SparkExtractEntitiesJob} - * In case of dataset, publication and Unknown Entities - * we group all the entities of the same type by their identifier, - * and then in the reduce phase we merge all the entities. - * Merge means: - * -merge all the metadata - * -merge the collected From values + * This job is responsible of the creation of RAW Graph It is applied to the different entities + * generated from {@link SparkExtractEntitiesJob} In case of dataset, publication and Unknown + * Entities we group all the entities of the same type by their identifier, and then in the reduce + * phase we merge all the entities. Merge means: -merge all the metadata -merge the collected From + * values * - * In case of relation we need to make a different work: - * -Phase 1: Map reduce jobs - * Map: Get all Relation and emit a key constructed by (source, relType, Target) and the relation itself - * Reduce: Merge all relations - * Looking at the javadoc of {@link SparkSXGeneratePidSimlarity} we take the dataset of pid relation - * and joining by source and target we replace the wrong identifier in the relation with the correct ones. - * At the end we replace the new Dataset of Relation + *

In case of relation we need to make a different work: -Phase 1: Map reduce jobs Map: Get all + * Relation and emit a key constructed by (source, relType, Target) and the relation itself Reduce: + * Merge all relations Looking at the javadoc of {@link SparkSXGeneratePidSimlarity} we take the + * dataset of pid relation and joining by source and target we replace the wrong identifier in the + * relation with the correct ones. At the end we replace the new Dataset of Relation */ - public class SparkScholexplorerCreateRawGraphJob { - final static String IDJSONPATH = "$.id"; - final static String SOURCEJSONPATH = "$.source"; - final static String TARGETJSONPATH = "$.target"; - final static String RELJSONPATH = "$.relType"; + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; + static final String RELJSONPATH = "$.relType"; public static void main(String[] args) throws Exception { - - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerCreateRawGraphJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkScholexplorerCreateRawGraphJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .config(new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) - .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .config( + new SparkConf() + .set( + "spark.serializer", + "org.apache.spark.serializer.KryoSerializer")) + 
.appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); final String targetPath = parser.get("targetPath"); final String entity = parser.get("entity"); FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); - List subFolder = Arrays.stream(fs.listStatus(new Path(inputPath))).filter(FileStatus::isDirectory).map(FileStatus::getPath).collect(Collectors.toList()); + List subFolder = + Arrays.stream(fs.listStatus(new Path(inputPath))) + .filter(FileStatus::isDirectory) + .map(FileStatus::getPath) + .collect(Collectors.toList()); List> inputRdd = new ArrayList<>(); subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath()))); JavaRDD union = sc.emptyRDD(); @@ -90,118 +89,192 @@ public class SparkScholexplorerCreateRawGraphJob { } switch (entity) { case "dataset": - union.mapToPair((PairFunction) f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class)); - }).reduceByKey((a, b) -> { - a.mergeFrom(b); - return a; - }).map(item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }).saveAsTextFile(targetPath, GzipCodec.class); + union.mapToPair( + (PairFunction) + f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure( + DeserializationFeature + .FAIL_ON_UNKNOWN_PROPERTIES, + false); + return new Tuple2<>( + id, mapper.readValue(f, DLIDataset.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, 
GzipCodec.class); break; case "publication": - union.mapToPair((PairFunction) f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class)); - }).reduceByKey((a, b) -> { - a.mergeFrom(b); - return a; - }).map(item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }).saveAsTextFile(targetPath, GzipCodec.class); + union.mapToPair( + (PairFunction) + f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure( + DeserializationFeature + .FAIL_ON_UNKNOWN_PROPERTIES, + false); + return new Tuple2<>( + id, mapper.readValue(f, DLIPublication.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); break; case "unknown": - union.mapToPair((PairFunction) f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class)); - }).reduceByKey((a, b) -> { - a.mergeFrom(b); - return a; - }).map(item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }).saveAsTextFile(targetPath, GzipCodec.class); + union.mapToPair( + (PairFunction) + f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure( + DeserializationFeature + .FAIL_ON_UNKNOWN_PROPERTIES, + false); + return new Tuple2<>( + id, mapper.readValue(f, DLIUnknown.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + 
ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); break; case "relation": + SparkSXGeneratePidSimlarity.generateDataFrame( + spark, + sc, + inputPath.replace("/relation", ""), + targetPath.replace("/relation", "")); + RDD rdd = + union.mapToPair( + (PairFunction) + f -> { + final String source = + getJPathString(SOURCEJSONPATH, f); + final String target = + getJPathString(TARGETJSONPATH, f); + final String reltype = + getJPathString(RELJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure( + DeserializationFeature + .FAIL_ON_UNKNOWN_PROPERTIES, + false); + return new Tuple2<>( + DHPUtils.md5( + String.format( + "%s::%s::%s", + source.toLowerCase(), + reltype.toLowerCase(), + target.toLowerCase())), + mapper.readValue(f, DLIRelation.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map(Tuple2::_2) + .rdd(); + spark.createDataset(rdd, Encoders.bean(DLIRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .save(targetPath); + Dataset rel_ds = + spark.read().load(targetPath).as(Encoders.bean(Relation.class)); + System.out.println( + "LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel"); + Dataset sim_ds = + spark.read() + .load(targetPath.replace("/relation", "") + "/pid_simRel") + .as(Encoders.bean(Relation.class)); - SparkSXGeneratePidSimlarity.generateDataFrame(spark, sc, inputPath.replace("/relation",""),targetPath.replace("/relation","") ); - RDD rdd = union.mapToPair((PairFunction) f -> { - final String source = getJPathString(SOURCEJSONPATH, f); - final String target = getJPathString(TARGETJSONPATH, f); - final String reltype = getJPathString(RELJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(DHPUtils.md5(String.format("%s::%s::%s", source.toLowerCase(), reltype.toLowerCase(), 
target.toLowerCase())), mapper.readValue(f, DLIRelation.class)); - }).reduceByKey((a, b) -> { - a.mergeFrom(b); - return a; - }).map(Tuple2::_2).rdd(); + Dataset ids = + sim_ds.map( + (MapFunction) + relation -> { + final String type = + StringUtils.substringBefore( + relation.getSource(), "|"); + relation.setTarget( + String.format( + "%s|%s", + type, + StringUtils.substringAfter( + relation.getTarget(), "::"))); + return relation; + }, + Encoders.bean(Relation.class)); - spark.createDataset(rdd, Encoders.bean(DLIRelation.class)).write().mode(SaveMode.Overwrite).save(targetPath); - Dataset rel_ds =spark.read().load(targetPath).as(Encoders.bean(Relation.class)); - - System.out.println("LOADING PATH :"+targetPath.replace("/relation","")+"/pid_simRel"); - Datasetsim_ds =spark.read().load(targetPath.replace("/relation","")+"/pid_simRel").as(Encoders.bean(Relation.class)); - - Dataset ids = sim_ds.map((MapFunction) relation-> - { - final String type = StringUtils.substringBefore(relation.getSource(), "|"); - relation.setTarget(String.format("%s|%s", type, StringUtils.substringAfter(relation.getTarget(),"::"))); - return relation; - } - , Encoders.bean(Relation.class)); - - - final Dataset firstJoin = rel_ds - .joinWith(ids, ids.col("target") - .equalTo(rel_ds.col("source")), "left_outer") - .map((MapFunction, Relation>) s -> - { - if (s._2() != null) { - s._1().setSource(s._2().getSource()); - } - return s._1(); - } - , Encoders.bean(Relation.class)); - - - Dataset secondJoin = firstJoin.joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")),"left_outer") - .map((MapFunction, Relation>) s -> - { - if (s._2() != null) { - s._1().setTarget(s._2().getSource()); - } - return s._1(); - } - , Encoders.bean(Relation.class)); - secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed"); + final Dataset firstJoin = + rel_ds.joinWith( + ids, + ids.col("target").equalTo(rel_ds.col("source")), + "left_outer") + .map( + (MapFunction, Relation>) + s -> { + if 
(s._2() != null) { + s._1().setSource(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); + Dataset secondJoin = + firstJoin + .joinWith( + ids, + ids.col("target").equalTo(firstJoin.col("target")), + "left_outer") + .map( + (MapFunction, Relation>) + s -> { + if (s._2() != null) { + s._1().setTarget(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); + secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed"); FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration()); - fileSystem.delete(new Path(targetPath), true); - fileSystem.rename(new Path(targetPath+"_fixed"),new Path(targetPath)); - + fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath)); } } public static String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; + if (o instanceof String) return (String) o; if (o instanceof JSONArray && ((JSONArray) o).size() > 0) return (String) ((JSONArray) o).get(0); return ""; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java index 90606f1b8..b998c1b04 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java @@ -2,9 +2,9 @@ package eu.dnetlib.dhp.sx.graph; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; -import eu.dnetlib.dhp.schema.oaf.Oaf; import 
eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.IntWritable; @@ -15,47 +15,57 @@ import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.SparkSession; import scala.Tuple2; - /** - * This Job read a sequential File containing XML stored in the aggregator - * and generates an RDD of heterogeneous entities like Dataset, Relation, Publication and Unknown + * This Job read a sequential File containing XML stored in the aggregator and generates an RDD of + * heterogeneous entities like Dataset, Relation, Publication and Unknown */ - public class SparkScholexplorerGraphImporter { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerGraphImporter.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkScholexplorerGraphImporter.class.getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = + SparkSession.builder() + .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final String inputPath = parser.get("sourcePath"); RelationMapper relationMapper = RelationMapper.load(); - sc.sequenceFile(inputPath, IntWritable.class, Text.class).map(Tuple2::_2).map(Text::toString).repartition(500) - .flatMap((FlatMapFunction) record -> { - switch (parser.get("entity")) { - case "dataset": - final 
DatasetScholexplorerParser d = new DatasetScholexplorerParser(); - return d.parseObject(record,relationMapper).iterator(); - case "publication": - final PublicationScholexplorerParser p = new PublicationScholexplorerParser(); - return p.parseObject(record,relationMapper).iterator(); - default: - throw new IllegalArgumentException("wrong values of entities"); - } - }).map(k -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(k); - }).saveAsTextFile(parser.get("targetPath"), GzipCodec.class); + sc.sequenceFile(inputPath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map(Text::toString) + .repartition(500) + .flatMap( + (FlatMapFunction) + record -> { + switch (parser.get("entity")) { + case "dataset": + final DatasetScholexplorerParser d = + new DatasetScholexplorerParser(); + return d.parseObject(record, relationMapper).iterator(); + case "publication": + final PublicationScholexplorerParser p = + new PublicationScholexplorerParser(); + return p.parseObject(record, relationMapper).iterator(); + default: + throw new IllegalArgumentException( + "wrong values of entities"); + } + }) + .map( + k -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(k); + }) + .saveAsTextFile(parser.get("targetPath"), GzipCodec.class); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java index 9eeff9613..bfd5d5c7f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java @@ -1,30 +1,39 @@ package eu.dnetlib.dhp.sx.graph.parser; - import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.schema.oaf.*; -import 
eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import javax.xml.stream.XMLStreamReader; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import javax.xml.stream.XMLStreamReader; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; public abstract class AbstractScholexplorerParser { protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); - final static Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); - private List datasetSubTypes = Arrays.asList("dataset", "software", "film", "sound", "physicalobject", "audiovisual", "collection", "other", "study", "metadata"); + static final Pattern pattern = + Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); + private List datasetSubTypes = + Arrays.asList( + "dataset", + "software", + "film", + "sound", + "physicalobject", + "audiovisual", + "collection", + "other", + "study", + "metadata"); public abstract List parseObject(final String record, final RelationMapper relMapper); @@ -36,27 +45,27 @@ public abstract class AbstractScholexplorerParser { return attributesMap; } - protected List extractSubject(List subjects) { final List subjectResult = new ArrayList<>(); if (subjects != null && subjects.size() > 0) { - subjects.forEach(subjectMap -> { - final StructuredProperty subject = new StructuredProperty(); - 
subject.setValue(subjectMap.getTextValue()); - final Qualifier schema = new Qualifier(); - schema.setClassid("dnet:subject"); - schema.setClassname("dnet:subject"); - schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); - schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); - subject.setQualifier(schema); - subjectResult.add(subject); - }); + subjects.forEach( + subjectMap -> { + final StructuredProperty subject = new StructuredProperty(); + subject.setValue(subjectMap.getTextValue()); + final Qualifier schema = new Qualifier(); + schema.setClassid("dnet:subject"); + schema.setClassname("dnet:subject"); + schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); + schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); + subject.setQualifier(schema); + subjectResult.add(subject); + }); } return subjectResult; } - - protected StructuredProperty extractIdentifier(List identifierType, final String fieldName) { + protected StructuredProperty extractIdentifier( + List identifierType, final String fieldName) { final StructuredProperty pid = new StructuredProperty(); if (identifierType != null && identifierType.size() > 0) { final VtdUtilityParser.Node result = identifierType.get(0); @@ -88,7 +97,7 @@ public abstract class AbstractScholexplorerParser { protected String generateId(final String pid, final String pidType, final String entityType) { String type; - switch (entityType){ + switch (entityType) { case "publication": type = "50|"; break; @@ -99,16 +108,22 @@ public abstract class AbstractScholexplorerParser { type = "70|"; break; default: - throw new IllegalArgumentException("unexpected value "+entityType); - + throw new IllegalArgumentException("unexpected value " + entityType); } - if ("dnet".equalsIgnoreCase(pidType)) - return type+StringUtils.substringAfter(pid, "::"); + if ("dnet".equalsIgnoreCase(pidType)) return type + StringUtils.substringAfter(pid, "::"); - return type+ 
DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + return type + + DHPUtils.md5( + String.format( + "%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); } - protected DLIUnknown createUnknownObject(final String pid, final String pidType, final KeyValue cf, final DataInfo di, final String dateOfCollection) { + protected DLIUnknown createUnknownObject( + final String pid, + final String pidType, + final KeyValue cf, + final DataInfo di, + final String dateOfCollection) { final DLIUnknown uk = new DLIUnknown(); uk.setId(generateId(pid, pidType, "unknown")); ProvenaceInfo pi = new ProvenaceInfo(); @@ -130,56 +145,70 @@ public abstract class AbstractScholexplorerParser { return uk; } - protected void generateRelations(RelationMapper relationMapper, Result parsedObject, List result, DataInfo di, String dateOfCollection, List relatedIdentifiers) { - if(relatedIdentifiers!= null) { - result.addAll(relatedIdentifiers.stream() - .flatMap(n -> { - final List rels = new ArrayList<>(); - DLIRelation r = new DLIRelation(); - r.setSource(parsedObject.getId()); - final String relatedPid = n.getTextValue(); - final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); - final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown"); - String relationSemantic = n.getAttributes().get("relationType"); - String inverseRelation; - final String targetId = generateId(relatedPid, relatedPidType, relatedType); - r.setDateOfCollection(dateOfCollection); - if (relationMapper.containsKey(relationSemantic.toLowerCase())) - { - RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); - relationSemantic = relInfo.getOriginal(); - inverseRelation = relInfo.getInverse(); - } - else { - relationSemantic = "Unknown"; - inverseRelation = "Unknown"; - } - r.setTarget(targetId); - r.setRelType(relationSemantic); - r.setRelClass("datacite"); - 
r.setCollectedFrom(parsedObject.getCollectedfrom()); - r.setDataInfo(di); - rels.add(r); - r = new DLIRelation(); - r.setDataInfo(di); - r.setSource(targetId); - r.setTarget(parsedObject.getId()); - r.setRelType(inverseRelation); - r.setRelClass("datacite"); - r.setCollectedFrom(parsedObject.getCollectedfrom()); - r.setDateOfCollection(dateOfCollection); - rels.add(r); - if("unknown".equalsIgnoreCase(relatedType)) - result.add(createUnknownObject(relatedPid, relatedPidType, parsedObject.getCollectedfrom().get(0), di, dateOfCollection)); - return rels.stream(); - }).collect(Collectors.toList())); + protected void generateRelations( + RelationMapper relationMapper, + Result parsedObject, + List result, + DataInfo di, + String dateOfCollection, + List relatedIdentifiers) { + if (relatedIdentifiers != null) { + result.addAll( + relatedIdentifiers.stream() + .flatMap( + n -> { + final List rels = new ArrayList<>(); + DLIRelation r = new DLIRelation(); + r.setSource(parsedObject.getId()); + final String relatedPid = n.getTextValue(); + final String relatedPidType = + n.getAttributes().get("relatedIdentifierType"); + final String relatedType = + n.getAttributes() + .getOrDefault("entityType", "unknown"); + String relationSemantic = + n.getAttributes().get("relationType"); + String inverseRelation; + final String targetId = + generateId(relatedPid, relatedPidType, relatedType); + r.setDateOfCollection(dateOfCollection); + if (relationMapper.containsKey( + relationSemantic.toLowerCase())) { + RelInfo relInfo = + relationMapper.get( + relationSemantic.toLowerCase()); + relationSemantic = relInfo.getOriginal(); + inverseRelation = relInfo.getInverse(); + } else { + relationSemantic = "Unknown"; + inverseRelation = "Unknown"; + } + r.setTarget(targetId); + r.setRelType(relationSemantic); + r.setRelClass("datacite"); + r.setCollectedFrom(parsedObject.getCollectedfrom()); + r.setDataInfo(di); + rels.add(r); + r = new DLIRelation(); + r.setDataInfo(di); + r.setSource(targetId); + 
r.setTarget(parsedObject.getId()); + r.setRelType(inverseRelation); + r.setRelClass("datacite"); + r.setCollectedFrom(parsedObject.getCollectedfrom()); + r.setDateOfCollection(dateOfCollection); + rels.add(r); + if ("unknown".equalsIgnoreCase(relatedType)) + result.add( + createUnknownObject( + relatedPid, + relatedPidType, + parsedObject.getCollectedfrom().get(0), + di, + dateOfCollection)); + return rels.stream(); + }) + .collect(Collectors.toList())); } } - - - - } - - - diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java index f1915c5cf..1fb77c147 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java @@ -4,21 +4,17 @@ import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; +import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; - -import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; -import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import org.apache.commons.lang3.StringUtils; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; public class DatasetScholexplorerParser extends AbstractScholexplorerParser { @Override @@ -39,13 +35,19 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { 
di.setInvisible(false); parsedObject.setDataInfo(di); - parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + parsedObject.setOriginalId( + Collections.singletonList( + VtdUtilityParser.getSingleValue( + ap, vn, "//*[local-name()='recordIdentifier']"))); - parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + parsedObject.setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + String dateOfCollection = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); + final String resolvedDate = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); if (StringUtils.isNotBlank(resolvedDate)) { StructuredProperty currentDate = new StructuredProperty(); @@ -59,16 +61,28 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { parsedObject.setRelevantdate(Collections.singletonList(currentDate)); } - final String completionStatus = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); - final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + final String completionStatus = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); + final String provisionMode = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']"); + final String publisher = + VtdUtilityParser.getSingleValue( + ap, vn, 
"//*[local-name()='resource']/*[local-name()='publisher']"); List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus")); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); List resolvededFromNodes = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus")); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); Field pf = new Field<>(); pf.setValue(publisher); @@ -76,90 +90,122 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { parsedObject.setPublisher(pf); final List provenances = new ArrayList<>(); if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach(it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); + collectedFromNodes.forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus( + it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); } if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach(it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - 
provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); + resolvededFromNodes.forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + provenance.setCompletionStatus( + it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); } parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map( - p-> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - } - ).collect(Collectors.toList())); - parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setCollectedfrom( + parsedObject.getDlicollectedfrom().stream() + .map( + p -> { + final KeyValue cf = new KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); + parsedObject.setCompletionStatus( + VtdUtilityParser.getSingleValue( + ap, vn, "//*[local-name()='completionStatus']")); final List identifierType = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']/*[local-name()='identifier']", Collections.singletonList("identifierType")); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='identifier']", + Collections.singletonList("identifierType")); StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); if (currentPid == null) return null; inferPid(currentPid); parsedObject.setPid(Collections.singletonList(currentPid)); - - final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), 
"dataset"); + final String sourceId = + generateId( + currentPid.getValue(), + currentPid.getQualifier().getClassid(), + "dataset"); parsedObject.setId(sourceId); - - List descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); + List descs = + VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); if (descs != null && descs.size() > 0) - parsedObject.setDescription(descs.stream() - .map(it -> it.length() < 10000 ? it : it.substring(0, 10000)) - .map(it -> { - final Field d = new Field<>(); - d.setValue(it); - return d; - }) - .collect(Collectors.toList())); - + parsedObject.setDescription( + descs.stream() + .map(it -> it.length() < 10000 ? it : it.substring(0, 10000)) + .map( + it -> { + final Field d = new Field<>(); + d.setValue(it); + return d; + }) + .collect(Collectors.toList())); final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']", - Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - - - generateRelations(relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays.asList( + "relatedIdentifierType", + "relationType", + "entityType", + "inverseRelationType")); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - + VtdUtilityParser.getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); if (hostedBy != null) { - parsedObject.setInstance(hostedBy.stream().map(it -> - { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - 
h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }).collect(Collectors.toList())); + parsedObject.setInstance( + hostedBy.stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl( + Collections.singletonList( + currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); } - - List subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']//*[local-name()='subject']", Collections.singletonList("subjectScheme"))); + List subjects = + extractSubject( + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='subject']", + Collections.singletonList("subjectScheme"))); parsedObject.setSubject(subjects); @@ -172,50 +218,61 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { parsedObject.setCompletionStatus(completionStatus); - final List creators = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); + final List creators = + VtdUtilityParser.getTextValue( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); if (creators != null && creators.size() > 0) { - parsedObject.setAuthor(creators - .stream() - .map(a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }).collect(Collectors.toList()) - ); + parsedObject.setAuthor( + creators.stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + .collect(Collectors.toList())); } - final List titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); + final List titles = + VtdUtilityParser.getTextValue( + ap, vn, 
"//*[local-name()='resource']//*[local-name()='title']"); if (titles != null && titles.size() > 0) { - parsedObject.setTitle(titles.stream() - .map(t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - return st; - } - ).collect(Collectors.toList()) - ); + parsedObject.setTitle( + titles.stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); } - final List dates = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); - + final List dates = + VtdUtilityParser.getTextValue( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); if (dates != null && dates.size() > 0) { - parsedObject.setRelevantdate(dates.stream().map( - cd -> { - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - return date; - } - ).collect(Collectors.toList())); + parsedObject.setRelevantdate( + dates.stream() + .map( + cd -> { + StructuredProperty date = new StructuredProperty(); + date.setValue(cd); + final Qualifier dq = new Qualifier(); + dq.setClassname("date"); + dq.setClassid("date"); + dq.setSchemename("dnet::date"); + dq.setSchemeid("dnet::date"); + date.setQualifier(dq); + return date; + }) + .collect(Collectors.toList())); } - - result.add(parsedObject); return result; } catch (Throwable e) { @@ -223,8 +280,4 @@ public class DatasetScholexplorerParser extends AbstractScholexplorerParser { return null; } } - - - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java index 
aa2f86076..ad560805a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java @@ -8,15 +8,13 @@ import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; -import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import org.apache.commons.lang3.StringUtils; - import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; public class PublicationScholexplorerParser extends AbstractScholexplorerParser { @@ -29,7 +27,6 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser vg.setDoc(record.getBytes()); vg.parse(true); - final VTDNav vn = vg.getNav(); final AutoPilot ap = new AutoPilot(vn); @@ -38,11 +35,16 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser di.setDeletedbyinference(false); di.setInvisible(false); - String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + String dateOfCollection = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + final String resolvedDate = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); + parsedObject.setOriginalId( + Collections.singletonList( + 
VtdUtilityParser.getSingleValue( + ap, vn, "//*[local-name()='recordIdentifier']"))); if (StringUtils.isNotBlank(resolvedDate)) { StructuredProperty currentDate = new StructuredProperty(); @@ -56,123 +58,164 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser parsedObject.setRelevantdate(Collections.singletonList(currentDate)); } - - final List pid = VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); + final List pid = + VtdUtilityParser.getTextValuesWithAttributes( + ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); StructuredProperty currentPid = extractIdentifier(pid, "type"); if (currentPid == null) return null; inferPid(currentPid); parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); + final String sourceId = + generateId( + currentPid.getValue(), + currentPid.getQualifier().getClassid(), + "publication"); parsedObject.setId(sourceId); - parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + parsedObject.setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + String provisionMode = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus")); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); List resolvededFromNodes = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", 
"mode", "completionStatus")); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); + final String publisher = + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); Field pf = new Field<>(); pf.setValue(publisher); parsedObject.setPublisher(pf); final List provenances = new ArrayList<>(); if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach(it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); + collectedFromNodes.forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus( + it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); } if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach(it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); + resolvededFromNodes.forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + 
provenance.setCompletionStatus( + it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); } parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setCompletionStatus( + VtdUtilityParser.getSingleValue( + ap, vn, "//*[local-name()='completionStatus']")); - parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - } - ).collect(Collectors.toList())); + parsedObject.setCollectedfrom( + parsedObject.getDlicollectedfrom().stream() + .map( + p -> { + final KeyValue cf = new KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']", - Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations(relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays.asList( + "relatedIdentifierType", + "relationType", + "entityType", + "inverseRelationType")); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - + VtdUtilityParser.getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); if (hostedBy != null) { - parsedObject.setInstance(hostedBy.stream().map(it -> - { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - 
i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }).collect(Collectors.toList())); + parsedObject.setInstance( + hostedBy.stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl( + Collections.singletonList( + currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); } - final List authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); + final List authorsNode = + VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); if (authorsNode != null) - parsedObject.setAuthor(authorsNode - .stream() - .map(a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }).collect(Collectors.toList()) - ); + parsedObject.setAuthor( + authorsNode.stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + .collect(Collectors.toList())); - final List titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); + final List titles = + VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); if (titles != null) { - parsedObject.setTitle(titles.stream() - .map(t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - return st; - } - ).collect(Collectors.toList()) - ); + parsedObject.setTitle( + titles.stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); } - Field description = new Field<>(); - description.setValue(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); + description.setValue( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); - if (StringUtils.isNotBlank(description.getValue()) && 
description.getValue().length() > 10000) { + if (StringUtils.isNotBlank(description.getValue()) + && description.getValue().length() > 10000) { description.setValue(description.getValue().substring(0, 10000)); } parsedObject.setDescription(Collections.singletonList(description)); - final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); StructuredProperty date = new StructuredProperty(); @@ -185,7 +228,13 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser date.setQualifier(dq); parsedObject.setRelevantdate(Collections.singletonList(date)); - List subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); + List subjects = + extractSubject( + VtdUtilityParser.getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='subject']", + Collections.singletonList("scheme"))); parsedObject.setSubject(subjects); parsedObject.setDataInfo(di); @@ -205,8 +254,5 @@ public class PublicationScholexplorerParser extends AbstractScholexplorerParser log.error("Error on parsing record ", e); return null; } - } - - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java index ecadbe981..e35339bfa 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java @@ -1,6 +1,9 @@ package eu.dnetlib.dhp.oa.graph; import eu.dnetlib.dhp.schema.common.ModelSupport; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.spark.SparkConf; @@ -12,15 +15,12 @@ import org.junit.jupiter.api.Test; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - public class GraphHiveImporterJobTest { private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class); - public static final String JDBC_DERBY_TEMPLATE = "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; + public static final String JDBC_DERBY_TEMPLATE = + "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; private static SparkSession spark; @@ -45,14 +45,16 @@ public class GraphHiveImporterJobTest { conf.set("spark.ui.enabled", "false"); conf.set("spark.sql.warehouse.dir", workingDir.toString()); conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - conf.set("javax.jdo.option.ConnectionURL", String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); + conf.set( + "javax.jdo.option.ConnectionURL", + String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); - spark = SparkSession - .builder() - .appName(GraphHiveImporterJobTest.class.getSimpleName()) - .config(conf) - .enableHiveSupport() - .getOrCreate(); + spark = + SparkSession.builder() + .appName(GraphHiveImporterJobTest.class.getSimpleName()) + .config(conf) + .enableHiveSupport() + .getOrCreate(); } @AfterAll @@ -64,20 +66,25 @@ public class GraphHiveImporterJobTest { @Test public void testImportGraphAsHiveDB() throws Exception { - GraphHiveImporterJob.main(new String[]{ - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), - "-hiveMetastoreUris", "", - "-hiveDbName", dbName - }); + GraphHiveImporterJob.main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), + "-hiveMetastoreUris", + "", + "-hiveDbName", + dbName + }); - 
ModelSupport.oafTypes.forEach((name, clazz) -> { - long count = spark.read().table(dbName + "." + name).count(); - int expected = name.equals("relation") ? 100 : 10; - - Assertions.assertEquals(expected, count, String.format("%s should be %s", name, expected)); - }); + ModelSupport.oafTypes.forEach( + (name, clazz) -> { + long count = spark.read().table(dbName + "." + name).count(); + int expected = name.equals("relation") ? 100 : 10; + Assertions.assertEquals( + expected, count, String.format("%s should be %s", name, expected)); + }); } - } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 63d7d50db..6af314d76 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,106 +1,102 @@ package eu.dnetlib.dhp.oa.graph.raw; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.junit.jupiter.MockitoExtension; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; +import eu.dnetlib.dhp.schema.oaf.*; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.BeforeEach; +import 
org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + @ExtendWith(MockitoExtension.class) public class MappersTest { - @Mock - private Map code2name; + @Mock private Map code2name; - @BeforeEach - public void setUp() throws Exception { - when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); - } + @BeforeEach + public void setUp() throws Exception { + when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); + } - @Test - void testPublication() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); + @Test + void testPublication() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(code2name).processMdRecord(xml); + final List list = new OafToOafMapper(code2name).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Publication); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(3, list.size()); + assertTrue(list.get(0) instanceof Publication); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); - final Publication p = (Publication) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + final Publication p = (Publication) list.get(0); + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); - assertValidId(p.getId()); - assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); - assertTrue(p.getAuthor().size() > 0); - assertTrue(p.getSubject().size() > 0); - assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); - assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); + assertValidId(p.getId()); + 
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + assertTrue(p.getSubject().size() > 0); + assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); + assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); - } + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); + } - @Test - void testDataset() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); + @Test + void testDataset() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(1, list.size()); - assertTrue(list.get(0) instanceof Dataset); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Dataset); - final Dataset d = (Dataset) list.get(0); + final Dataset d = (Dataset) list.get(0); - assertValidId(d.getId()); - assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); - assertTrue(d.getAuthor().size() > 0); - assertTrue(d.getSubject().size() > 0); - } + assertValidId(d.getId()); + 
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertTrue(d.getAuthor().size() > 0); + assertTrue(d.getSubject().size() > 0); + } - @Test - void testSoftware() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); + @Test + void testSoftware() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(1, list.size()); - assertTrue(list.get(0) instanceof Software); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Software); - final Software s = (Software) list.get(0); + final Software s = (Software) list.get(0); - assertValidId(s.getId()); - assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); - assertTrue(s.getAuthor().size() > 0); - assertTrue(s.getSubject().size() > 0); - } + assertValidId(s.getId()); + assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); + assertTrue(s.getAuthor().size() > 0); + assertTrue(s.getSubject().size() > 0); + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index b1fc9131f..3c7bc684d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -1,8 +1,17 @@ package eu.dnetlib.dhp.oa.graph.raw; +import static org.junit.jupiter.api.Assertions.assertEquals; + import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.*; +import java.io.IOException; +import java.sql.Array; +import java.sql.Date; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.List; +import java.util.Objects; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -11,283 +20,294 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.IOException; -import java.sql.Array; -import java.sql.Date; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.List; -import java.util.Objects; - -import static org.junit.jupiter.api.Assertions.assertEquals; - @ExtendWith(MockitoExtension.class) public class MigrateDbEntitiesApplicationTest { - private MigrateDbEntitiesApplication app; + private MigrateDbEntitiesApplication app; - @Mock - private ResultSet rs; + @Mock private ResultSet rs; - @BeforeEach - public void setUp() { - this.app = new MigrateDbEntitiesApplication(); - } + @BeforeEach + public void setUp() { + this.app = new MigrateDbEntitiesApplication(); + } - @Test - public void testProcessDatasource() throws Exception { - final List fields = prepareMocks("datasources_resultset_entry.json"); + @Test + public void testProcessDatasource() throws Exception { + final List fields = prepareMocks("datasources_resultset_entry.json"); - final List list = app.processDatasource(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processDatasource(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Datasource ds = (Datasource) list.get(0); - 
assertValidId(ds.getId()); - assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); - assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); - assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); - assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); - assertEquals(ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); - assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Datasource ds = (Datasource) list.get(0); + assertValidId(ds.getId()); + assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); + assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); + assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); + assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals( + ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); + assertEquals( + ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals( + ds.getCollectedfrom().get(0).getValue(), + getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessProject() throws Exception { - final List fields = prepareMocks("projects_resultset_entry.json"); + @Test + public void testProcessProject() throws Exception { + final List fields = prepareMocks("projects_resultset_entry.json"); - final List list = app.processProject(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processProject(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Project p = (Project) list.get(0); - assertValidId(p.getId()); - 
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); - assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); - assertEquals(p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); - assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Project p = (Project) list.get(0); + assertValidId(p.getId()); + assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); + assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); + assertEquals( + p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals( + p.getCollectedfrom().get(0).getValue(), + getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessOrganization() throws Exception { - final List fields = prepareMocks("organizations_resultset_entry.json"); + @Test + public void testProcessOrganization() throws Exception { + final List fields = prepareMocks("organizations_resultset_entry.json"); - final List list = app.processOrganization(rs); + final List list = app.processOrganization(rs); - assertEquals(1, list.size()); + assertEquals(1, list.size()); - verifyMocks(fields); + verifyMocks(fields); - final Organization o = (Organization) list.get(0); - assertValidId(o.getId()); - assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); - assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); - assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); - assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); - assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); - assertEquals(o.getCountry().getSchemename(), 
getValueAsString("country", fields).split("@@@")[3]); - assertEquals(o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); - assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Organization o = (Organization) list.get(0); + assertValidId(o.getId()); + assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); + assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); + assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals( + o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); + assertEquals( + o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); + assertEquals( + o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); + assertEquals( + o.getCountry().getSchemename(), + getValueAsString("country", fields).split("@@@")[3]); + assertEquals( + o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields)); + assertEquals( + o.getCollectedfrom().get(0).getValue(), + getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessDatasourceOrganization() throws Exception { - final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); + @Test + public void testProcessDatasourceOrganization() throws Exception { + final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); - final List list = app.processDatasourceOrganization(rs); + final List list = app.processDatasourceOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - 
assertEquals(r2.getSource(), r1.getTarget()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } - @Test - public void testProcessProjectOrganization() throws Exception { - final List fields = prepareMocks("projectorganization_resultset_entry.json"); + @Test + public void testProcessProjectOrganization() throws Exception { + final List fields = prepareMocks("projectorganization_resultset_entry.json"); - final List list = app.processProjectOrganization(rs); + final List list = app.processProjectOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } - @Test - public void testProcessClaims_context() throws Exception { - final List fields = prepareMocks("claimscontext_resultset_entry.json"); + @Test + public void testProcessClaims_context() throws Exception { + final List fields = prepareMocks("claimscontext_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(1, list.size()); - verifyMocks(fields); - } + assertEquals(1, list.size()); + verifyMocks(fields); + } - @Test - public void testProcessClaims_rels() throws Exception { - final List fields = prepareMocks("claimsrel_resultset_entry.json"); + @Test + public 
void testProcessClaims_rels() throws Exception { + final List fields = prepareMocks("claimsrel_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(2, list.size()); - verifyMocks(fields); - } + assertEquals(2, list.size()); + verifyMocks(fields); + } - private List prepareMocks(final String jsonFile) throws IOException, SQLException { - final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); - final ObjectMapper mapper = new ObjectMapper(); - final List list = mapper.readValue(json, new TypeReference>() {}); + private List prepareMocks(final String jsonFile) throws IOException, SQLException { + final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); + final ObjectMapper mapper = new ObjectMapper(); + final List list = + mapper.readValue(json, new TypeReference>() {}); - for (final TypedField tf : list) { - if (tf.getValue() == null) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())).thenReturn(null); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())).thenReturn(0); - break; - case "double": - Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); - break; - case "array": - Mockito.when(rs.getArray(tf.getField())).thenReturn(null); - break; - case "string": - default: - Mockito.when(rs.getString(tf.getField())).thenReturn(null); - break; - } - } else { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())).thenReturn(Boolean.parseBoolean(tf.getValue().toString())); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())).thenReturn(Date.valueOf(tf.getValue().toString())); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())).thenReturn(new Integer(tf.getValue().toString())); - break; - case "double": - 
Mockito.when(rs.getDouble(tf.getField())).thenReturn(new Double(tf.getValue().toString())); - break; - case "array": - final Array arr = Mockito.mock(Array.class); - final String[] values = ((List) tf.getValue()).stream() - .filter(Objects::nonNull) - .map(o -> o.toString()) - .toArray(String[]::new); + for (final TypedField tf : list) { + if (tf.getValue() == null) { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())).thenReturn(null); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())).thenReturn(0); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); + break; + case "array": + Mockito.when(rs.getArray(tf.getField())).thenReturn(null); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(null); + break; + } + } else { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())) + .thenReturn(Boolean.parseBoolean(tf.getValue().toString())); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())) + .thenReturn(Date.valueOf(tf.getValue().toString())); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())) + .thenReturn(new Integer(tf.getValue().toString())); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())) + .thenReturn(new Double(tf.getValue().toString())); + break; + case "array": + final Array arr = Mockito.mock(Array.class); + final String[] values = + ((List) tf.getValue()) + .stream() + .filter(Objects::nonNull) + .map(o -> o.toString()) + .toArray(String[]::new); - Mockito.when(arr.getArray()).thenReturn(values); - Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); - break; - case "string": - default: - Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); - break; - } - } - } + 
Mockito.when(arr.getArray()).thenReturn(values); + Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())) + .thenReturn(tf.getValue().toString()); + break; + } + } + } - return list; - } + return list; + } - private void verifyMocks(final List list) throws SQLException { - for (final TypedField tf : list) { + private void verifyMocks(final List list) throws SQLException { + for (final TypedField tf : list) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); - break; - case "date": - Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); - break; - case "int": - Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); - break; - case "double": - Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); - break; - case "array": - Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); - break; - case "string": - default: - Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); - break; - } - } - } + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); + break; + case "date": + Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); + break; + case "int": + Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); + break; + case "double": + Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); + break; + case "array": + Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); + break; + case "string": + default: + Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); + break; + } + } + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void 
assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } - private String getValueAsString(final String name, final List fields) { - return fields.stream() - .filter(f -> f.getField().equals(name)) - .map(TypedField::getValue) - .filter(Objects::nonNull) - .map(o -> o.toString()) - .findFirst() - .get(); - } + private String getValueAsString(final String name, final List fields) { + return fields.stream() + .filter(f -> f.getField().equals(name)) + .map(TypedField::getValue) + .filter(Objects::nonNull) + .map(o -> o.toString()) + .findFirst() + .get(); + } } class TypedField { - private String field; - private String type; - private Object value; + private String field; + private String type; + private Object value; - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(final String field) { - this.field = field; - } + public void setField(final String field) { + this.field = field; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(final String type) { - this.type = type; - } + public void setType(final String type) { + this.type = type; + } - public Object getValue() { - return value; - } - - public void setValue(final Object value) { - this.value = value; - } + public Object getValue() { + return value; + } + public void setValue(final Object value) { + this.value = value; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java index 5741dd628..eb7a25a75 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java @@ -3,17 +3,15 @@ package eu.dnetlib.dhp.sx.graph; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; -import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.scholexplorer.relation.RelationMapper; +import java.util.List; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; -import java.util.List; - public class ScholexplorerParserTest { - @Test public void testDataciteParser() throws Exception { String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); @@ -24,15 +22,14 @@ public class ScholexplorerParserTest { ObjectMapper m = new ObjectMapper(); m.enable(SerializationFeature.INDENT_OUTPUT); + oaves.forEach( + oaf -> { + try { + System.out.println(m.writeValueAsString(oaf)); + System.out.println("----------------------------"); + } catch (JsonProcessingException e) { - oaves.forEach(oaf -> { - try { - System.out.println(m.writeValueAsString(oaf)); - System.out.println("----------------------------"); - } catch (JsonProcessingException e) { - - } - }); - + } + }); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java index 4c4d5372c..7f32de318 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java @@ -1,11 +1,3 @@ package eu.dnetlib.dhp.sx.graph; - - - -public class SparkScholexplorerGraphImporterTest { - - - - -} 
+public class SparkScholexplorerGraphImporterTest {} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java index f080b36cb..af6385803 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java @@ -1,8 +1,3 @@ package eu.dnetlib.dhp.sx.graph; - - -public class SparkScholexplorerMergeEntitiesJobTest { - - -} +public class SparkScholexplorerMergeEntitiesJobTest {} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java index aed444660..6d1609e25 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java @@ -6,25 +6,25 @@ import org.apache.commons.lang3.StringUtils; public class ProvisionUtil { - public final static String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; - public final static String TARGETJSONPATH = "$.target"; - public final static String SOURCEJSONPATH = "$.source"; + public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; + public static final String TARGETJSONPATH = "$.target"; + public static final String SOURCEJSONPATH = "$.source"; -// public static RelatedItemInfo getItemType(final String item, final String idPath) { -// String targetId = DHPUtils.getJPathString(idPath, item); -// switch (StringUtils.substringBefore(targetId, "|")) { -// case "50": -// return new RelatedItemInfo(null,0,1,0); -// case 
"60": -// return new RelatedItemInfo(null,1,0,0); -// case "70": -// return new RelatedItemInfo(null,0,0,1); -// default: -// throw new RuntimeException("Unknonw target ID"); -// -// } -// -// } + // public static RelatedItemInfo getItemType(final String item, final String idPath) { + // String targetId = DHPUtils.getJPathString(idPath, item); + // switch (StringUtils.substringBefore(targetId, "|")) { + // case "50": + // return new RelatedItemInfo(null,0,1,0); + // case "60": + // return new RelatedItemInfo(null,1,0,0); + // case "70": + // return new RelatedItemInfo(null,0,0,1); + // default: + // throw new RuntimeException("Unknonw target ID"); + // + // } + // + // } public static Boolean isNotDeleted(final String item) { return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); @@ -41,7 +41,6 @@ public class ProvisionUtil { return Typology.unknown; default: throw new RuntimeException("Unknonw ID type"); - } } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java index 3b07aab8d..145ee5b39 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java @@ -2,10 +2,7 @@ package eu.dnetlib.dhp.provision; import java.io.Serializable; -/** - * This class models the information of related items - */ - +/** This class models the information of related items */ public class RelatedItemInfo implements Serializable { private String source; @@ -16,10 +13,10 @@ public class RelatedItemInfo implements Serializable { private long relatedUnknown = 0; - public RelatedItemInfo() { - } + public RelatedItemInfo() {} - public RelatedItemInfo(String source, long relatedDataset, long relatedPublication, long 
relatedUnknown) { + public RelatedItemInfo( + String source, long relatedDataset, long relatedPublication, long relatedUnknown) { this.source = source; this.relatedDataset = relatedDataset; this.relatedPublication = relatedPublication; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java index 2c9642794..90bde451b 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java @@ -1,57 +1,32 @@ package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function2; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.*; -import org.apache.spark.sql.catalyst.expressions.Expression; -import scala.Tuple2; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; - /** - * SparkExtractRelationCount is a spark job that takes in input relation RDD - * and retrieve for each item in relation which are the number of - * - Related Dataset - * - Related Publication - * - Related Unknown + * SparkExtractRelationCount is a spark job that takes in input relation RDD and retrieve for each + * item in relation which are the number of - Related Dataset - Related Publication - Related + * Unknown */ public class SparkExtractRelationCount { - - - - public static void main(String[] args) throws Exception { 
- final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractRelationCount.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkExtractRelationCount.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - + final SparkSession spark = + SparkSession.builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final String workingDirPath = parser.get("workingDirPath"); final String relationPath = parser.get("relationPath"); - DatasetJoiner.startJoin(spark, relationPath,workingDirPath + "/relatedItemCount"); - + DatasetJoiner.startJoin(spark, relationPath, workingDirPath + "/relatedItemCount"); } - - - - - - - } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java index 58a98e490..989e1877a 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java @@ -18,67 +18,94 @@ import scala.Tuple2; public class SparkGenerateScholix { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholix.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + final ArgumentApplicationParser 
parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkGenerateScholix.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); parser.parseArgument(args); SparkConf conf = new SparkConf(); - conf.set("spark.sql.shuffle.partitions","4000"); + conf.set("spark.sql.shuffle.partitions", "4000"); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - final SparkSession spark = SparkSession - .builder() - .config(conf) - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - conf.registerKryoClasses(new Class[]{ - Scholix.class, - ScholixCollectedFrom.class, - ScholixEntityId.class, - ScholixIdentifier.class, - ScholixRelationship.class, - ScholixResource.class - }); - + final SparkSession spark = + SparkSession.builder() + .config(conf) + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + conf.registerKryoClasses( + new Class[] { + Scholix.class, + ScholixCollectedFrom.class, + ScholixEntityId.class, + ScholixIdentifier.class, + ScholixRelationship.class, + ScholixResource.class + }); final String graphPath = parser.get("graphPath"); final String workingDirPath = parser.get("workingDirPath"); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final Dataset scholixSummary = spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class)); - final Dataset rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); + final Dataset scholixSummary = + spark.read() + .load(workingDirPath + "/summary") + .as(Encoders.bean(ScholixSummary.class)); + final Dataset rels = + spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); + Dataset firstJoin = + scholixSummary + .joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) + .map( + (MapFunction, Scholix>) + f -> 
Scholix.generateScholixWithSource(f._1(), f._2()), + Encoders.bean(Scholix.class)); - Dataset firstJoin = scholixSummary.joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) - .map((MapFunction, Scholix>) f -> Scholix.generateScholixWithSource(f._1(), f._2()), Encoders.bean(Scholix.class)); + firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath + "/scholix_1"); - firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_1"); - - Dataset scholix_final = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class)); + Dataset scholix_final = + spark.read().load(workingDirPath + "/scholix_1").as(Encoders.bean(Scholix.class)); scholixSummary - .map((MapFunction) ScholixResource::fromSummary, Encoders.bean(ScholixResource.class)) + .map( + (MapFunction) ScholixResource::fromSummary, + Encoders.bean(ScholixResource.class)) .repartition(1000) .write() .mode(SaveMode.Overwrite) - .save(workingDirPath+"/scholix_target"); + .save(workingDirPath + "/scholix_target"); - Dataset target = spark.read().load(workingDirPath+"/scholix_target").as(Encoders.bean(ScholixResource.class)); + Dataset target = + spark.read() + .load(workingDirPath + "/scholix_target") + .as(Encoders.bean(ScholixResource.class)); - scholix_final.joinWith(target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner") - .map((MapFunction, Scholix>) f -> { - final Scholix scholix = f._1(); - final ScholixResource scholixTarget = f._2(); - scholix.setTarget(scholixTarget); - scholix.generateIdentifier(); - scholix.generatelinkPublisher(); - return scholix; - }, Encoders.kryo(Scholix.class)).javaRDD().map(s-> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(s); - }).saveAsTextFile(workingDirPath+"/scholix_json", GzipCodec.class); + scholix_final + .joinWith( + target, + scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), + "inner") + .map( + (MapFunction, Scholix>) + f -> { + 
final Scholix scholix = f._1(); + final ScholixResource scholixTarget = f._2(); + scholix.setTarget(scholixTarget); + scholix.generateIdentifier(); + scholix.generatelinkPublisher(); + return scholix; + }, + Encoders.kryo(Scholix.class)) + .javaRDD() + .map( + s -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(s); + }) + .saveAsTextFile(workingDirPath + "/scholix_json", GzipCodec.class); } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java index 39b7a9468..ea7599788 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java @@ -1,17 +1,11 @@ package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; @@ -21,68 +15,99 @@ public class SparkGenerateSummary { private static final String jsonIDPath = "$.id"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new 
ArgumentApplicationParser(IOUtils.toString(SparkGenerateSummary.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkGenerateSummary.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - + final SparkSession spark = + SparkSession.builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); final String graphPath = parser.get("graphPath"); final String workingDirPath = parser.get("workingDirPath"); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - Dataset rInfo = spark.read().load(workingDirPath + "/relatedItemCount").as(Encoders.bean(RelatedItemInfo.class)); + Dataset rInfo = + spark.read() + .load(workingDirPath + "/relatedItemCount") + .as(Encoders.bean(RelatedItemInfo.class)); + Dataset entity = + spark.createDataset( + sc.textFile( + graphPath + + "/publication," + + graphPath + + "/dataset," + + graphPath + + "/unknown") + .map( + s -> + ScholixSummary.fromJsonOAF( + ProvisionUtil.getItemTypeFromId( + DHPUtils.getJPathString( + jsonIDPath, s)), + s)) + .rdd(), + Encoders.bean(ScholixSummary.class)); - Dataset entity = spark.createDataset(sc.textFile(graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown") - .map(s -> - ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), s) + Dataset summaryComplete = + rInfo.joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))) + .map( + (MapFunction< + Tuple2, + ScholixSummary>) + t -> { + ScholixSummary scholixSummary = t._2(); + RelatedItemInfo relatedItemInfo = t._1(); + 
scholixSummary.setRelatedDatasets( + relatedItemInfo.getRelatedDataset()); + scholixSummary.setRelatedPublications( + relatedItemInfo.getRelatedPublication()); + scholixSummary.setRelatedUnknown( + relatedItemInfo.getRelatedUnknown()); + return scholixSummary; + }, + Encoders.bean(ScholixSummary.class)); + summaryComplete.write().save(workingDirPath + "/summary"); - ).rdd(), Encoders.bean(ScholixSummary.class)); - - - Dataset summaryComplete = rInfo.joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))).map((MapFunction, ScholixSummary>) t -> - { - ScholixSummary scholixSummary = t._2(); - RelatedItemInfo relatedItemInfo = t._1(); - scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - scholixSummary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - return scholixSummary; - }, Encoders.bean(ScholixSummary.class) - ); - - summaryComplete.write().save(workingDirPath+"/summary"); - - -// JavaPairRDD relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); -// -// JavaPairRDD entities = -// sc.textFile(graphPath + "/publication") -// .filter(ProvisionUtil::isNotDeleted) -// .mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) -// .union( -// sc.textFile(graphPath + "/dataset") -// .filter(ProvisionUtil::isNotDeleted) -// .mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) -// ) -// .union( -// sc.textFile(graphPath + "/unknown") -// .filter(ProvisionUtil::isNotDeleted) -// .mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) -// ); -// entities.join(relationCount).map((Function>, String>) k -> -// ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); -// 
-// -// ; + // JavaPairRDD relationCount = + // sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); + // + // JavaPairRDD entities = + // sc.textFile(graphPath + "/publication") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) i -> new + // Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // .union( + // sc.textFile(graphPath + "/dataset") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ) + // .union( + // sc.textFile(graphPath + "/unknown") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ); + // entities.join(relationCount).map((Function>, + // String>) k -> + // ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), + // k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); + // + // + // ; } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java index ce3c6315c..d597f42a4 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java @@ -2,34 +2,35 @@ package eu.dnetlib.dhp.provision; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import java.util.HashMap; +import java.util.Map; import org.apache.commons.io.IOUtils; import 
org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; -import java.nio.file.attribute.AclFileAttributeView; -import java.util.HashMap; -import java.util.Map; - public class SparkIndexCollectionOnES { - public static void main(String[] args) throws Exception{ + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkIndexCollectionOnES.class.getResourceAsStream("/eu/dnetlib/dhp/provision/index_on_es.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkIndexCollectionOnES.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/index_on_es.json"))); parser.parseArgument(args); - SparkConf conf = new SparkConf().setAppName(SparkIndexCollectionOnES.class.getSimpleName()) - .setMaster(parser.get("master")); - - conf.set("spark.sql.shuffle.partitions","4000"); + SparkConf conf = + new SparkConf() + .setAppName(SparkIndexCollectionOnES.class.getSimpleName()) + .setMaster(parser.get("master")); + conf.set("spark.sql.shuffle.partitions", "4000"); final String sourcePath = parser.get("sourcePath"); final String index = parser.get("index"); @@ -38,20 +39,24 @@ public class SparkIndexCollectionOnES { final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD inputRdd; - - if("summary".equalsIgnoreCase(type)) - inputRdd = spark.read().load(sourcePath).as(Encoders.bean(ScholixSummary.class)).map((MapFunction) f -> { - final ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(f); - }, 
Encoders.STRING()).javaRDD(); - - else - inputRdd = sc.textFile(sourcePath); + if ("summary".equalsIgnoreCase(type)) + inputRdd = + spark.read() + .load(sourcePath) + .as(Encoders.bean(ScholixSummary.class)) + .map( + (MapFunction) + f -> { + final ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(f); + }, + Encoders.STRING()) + .javaRDD(); + else inputRdd = sc.textFile(sourcePath); Map esCfg = new HashMap<>(); esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); @@ -60,7 +65,6 @@ public class SparkIndexCollectionOnES { esCfg.put("es.batch.write.retry.wait", "60s"); esCfg.put("es.batch.size.entries", "200"); esCfg.put("es.nodes.wan.only", "true"); - JavaEsSpark.saveJsonToEs(inputRdd,index, esCfg); - + JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java index c3ccf6899..d10cfcabd 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java @@ -23,7 +23,6 @@ public class Scholix implements Serializable { private String identifier; - public Scholix clone(final ScholixResource t) { final Scholix clone = new Scholix(); clone.setPublicationDate(publicationDate); @@ -34,70 +33,100 @@ public class Scholix implements Serializable { clone.setTarget(t); clone.generatelinkPublisher(); clone.generateIdentifier(); - return clone; + return clone; } - - public static Scholix generateScholixWithSource(final String sourceSummaryJson, final String relation) { + public static Scholix generateScholixWithSource( + final String sourceSummaryJson, final String relation) { final ObjectMapper mapper = new ObjectMapper(); try { - 
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); + ScholixSummary scholixSummary = + mapper.readValue(sourceSummaryJson, ScholixSummary.class); Relation rel = mapper.readValue(relation, Relation.class); final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0) + if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider(rel.getCollectedFrom().stream().map(cf -> - new ScholixEntityId(cf.getValue(), Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier") - ))).collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null )); + s.setLinkprovider( + rel.getCollectedFrom().stream() + .map( + cf -> + new ScholixEntityId( + cf.getValue(), + Collections.singletonList( + new ScholixIdentifier( + cf.getKey(), + "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); s.setSource(ScholixResource.fromSummary(scholixSummary)); return s; } catch (Throwable e) { - throw new RuntimeException(String.format("Summary: %s \n relation:%s",sourceSummaryJson, relation), e); + throw new RuntimeException( + String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); } } - public static Scholix generateScholixWithSource(final ScholixSummary scholixSummary, final Relation rel) { + public static Scholix generateScholixWithSource( + final ScholixSummary scholixSummary, final Relation rel) { final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0) + if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider(rel.getCollectedFrom().stream().map(cf -> - new ScholixEntityId(cf.getValue(), 
Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier") - ))).collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null )); + s.setLinkprovider( + rel.getCollectedFrom().stream() + .map( + cf -> + new ScholixEntityId( + cf.getValue(), + Collections.singletonList( + new ScholixIdentifier( + cf.getKey(), "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); s.setSource(ScholixResource.fromSummary(scholixSummary)); s.setIdentifier(rel.getTarget()); -// ScholixResource mockTarget = new ScholixResource(); -// mockTarget.setDnetIdentifier(rel.getTarget()); -// s.setTarget(mockTarget); -// s.generateIdentifier(); + // ScholixResource mockTarget = new ScholixResource(); + // mockTarget.setDnetIdentifier(rel.getTarget()); + // s.setTarget(mockTarget); + // s.generateIdentifier(); return s; } - public void generatelinkPublisher() { Set publisher = new HashSet<>(); if (source.getPublisher() != null) - publisher.addAll(source.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList())); + publisher.addAll( + source.getPublisher().stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); if (target.getPublisher() != null) - publisher.addAll(target.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList())); - this.publisher = publisher.stream().map(k -> new ScholixEntityId(k ,null)).collect(Collectors.toList()); + publisher.addAll( + target.getPublisher().stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); + this.publisher = + publisher.stream() + .map(k -> new ScholixEntityId(k, null)) + .collect(Collectors.toList()); } - public void generateIdentifier( ) { - setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier()))); - + public void 
generateIdentifier() { + setIdentifier( + DHPUtils.md5( + String.format( + "%s::%s::%s", + source.getDnetIdentifier(), + relationship.getName(), + target.getDnetIdentifier()))); } public Scholix addTarget(final String targetSummaryJson) { final ObjectMapper mapper = new ObjectMapper(); try { - ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); + ScholixSummary targetSummary = + mapper.readValue(targetSummaryJson, ScholixSummary.class); setTarget(ScholixResource.fromSummary(targetSummary)); generateIdentifier(); return this; @@ -157,6 +186,7 @@ public class Scholix implements Serializable { public String getIdentifier() { return identifier; } + public void setIdentifier(String identifier) { this.identifier = identifier; } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java index 2ba84188d..d3278268a 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java @@ -8,10 +8,10 @@ public class ScholixCollectedFrom implements Serializable { private String provisionMode; private String completionStatus; - public ScholixCollectedFrom() { - } + public ScholixCollectedFrom() {} - public ScholixCollectedFrom(ScholixEntityId provider, String provisionMode, String completionStatus) { + public ScholixCollectedFrom( + ScholixEntityId provider, String provisionMode, String completionStatus) { this.provider = provider; this.provisionMode = provisionMode; this.completionStatus = completionStatus; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java index 0f43a8d44..3b0f0f29e 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java @@ -7,8 +7,7 @@ public class ScholixEntityId implements Serializable { private String name; private List identifiers; - public ScholixEntityId() { - } + public ScholixEntityId() {} public ScholixEntityId(String name, List identifiers) { this.name = name; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java index f354ef10a..9ad4fd752 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java @@ -6,8 +6,7 @@ public class ScholixIdentifier implements Serializable { private String identifier; private String schema; - public ScholixIdentifier() { - } + public ScholixIdentifier() {} public ScholixIdentifier(String identifier, String schema) { this.identifier = identifier; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java index 1a35038b9..77cdbc764 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java +++ 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java @@ -7,8 +7,7 @@ public class ScholixRelationship implements Serializable { private String schema; private String inverse; - public ScholixRelationship() { - } + public ScholixRelationship() {} public ScholixRelationship(String name, String schema, String inverse) { this.name = name; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java index f29722eb8..d3ed3a4a4 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.provision.scholix; import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; - import java.io.Serializable; import java.util.Collections; import java.util.List; @@ -25,42 +24,47 @@ public class ScholixResource implements Serializable { resource.setDnetIdentifier(summary.getId()); - resource.setIdentifier(summary.getLocalIdentifier().stream() - .map(i -> - new ScholixIdentifier(i.getId(), i.getType())) - .collect(Collectors.toList())); + resource.setIdentifier( + summary.getLocalIdentifier().stream() + .map(i -> new ScholixIdentifier(i.getId(), i.getType())) + .collect(Collectors.toList())); resource.setObjectType(summary.getTypology().toString()); - if (summary.getTitle() != null && summary.getTitle().size() > 0) resource.setTitle(summary.getTitle().get(0)); if (summary.getAuthor() != null) - resource.setCreator(summary.getAuthor().stream() - .map(c -> new ScholixEntityId(c, null)) - .collect(Collectors.toList()) - ); + resource.setCreator( + summary.getAuthor().stream() + .map(c -> new ScholixEntityId(c, 
null)) + .collect(Collectors.toList())); if (summary.getDate() != null && summary.getDate().size() > 0) resource.setPublicationDate(summary.getDate().get(0)); if (summary.getPublisher() != null) - resource.setPublisher(summary.getPublisher().stream() - .map(p -> new ScholixEntityId(p, null)) - .collect(Collectors.toList()) - ); + resource.setPublisher( + summary.getPublisher().stream() + .map(p -> new ScholixEntityId(p, null)) + .collect(Collectors.toList())); if (summary.getDatasources() != null) - resource.setCollectedFrom(summary.getDatasources().stream() - .map(d -> - new ScholixCollectedFrom(new ScholixEntityId(d.getDatasourceName(), - Collections.singletonList(new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier")) - ), "collected", d.getCompletionStatus())) - .collect(Collectors.toList())); + resource.setCollectedFrom( + summary.getDatasources().stream() + .map( + d -> + new ScholixCollectedFrom( + new ScholixEntityId( + d.getDatasourceName(), + Collections.singletonList( + new ScholixIdentifier( + d.getDatasourceId(), + "dnet_identifier"))), + "collected", + d.getCompletionStatus())) + .collect(Collectors.toList())); return resource; - } - public List getIdentifier() { return identifier; } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java index 6fc0c7b29..045b85a02 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java @@ -8,9 +8,7 @@ public class CollectedFromType implements Serializable { private String datasourceId; private String completionStatus; - - public CollectedFromType() { - } + public CollectedFromType() {} public 
CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { this.datasourceName = datasourceName; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java index 95a292b9d..807a1d38f 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java @@ -6,9 +6,7 @@ public class SchemeValue implements Serializable { private String scheme; private String value; - public SchemeValue() { - - } + public SchemeValue() {} public SchemeValue(String scheme, String value) { this.scheme = scheme; diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java index 26538d156..676ae8cae 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java @@ -9,7 +9,6 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; - import java.io.Serializable; import java.util.Collections; import java.util.List; @@ -30,7 +29,6 @@ public class ScholixSummary implements Serializable { private long relatedUnknown; private List datasources; - public String getId() { return id; } @@ -137,7 +135,6 @@ public class ScholixSummary 
implements Serializable { this.datasources = datasources; } - public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) { try { final ObjectMapper mapper = new ObjectMapper(); @@ -145,11 +142,14 @@ public class ScholixSummary implements Serializable { mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); switch (oafType) { case dataset: - return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); + return summaryFromDataset( + mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); case publication: - return summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); + return summaryFromPublication( + mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); case unknown: - return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); + return summaryFromUnknown( + mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); } } catch (Throwable e) { throw new RuntimeException(e); @@ -157,23 +157,31 @@ public class ScholixSummary implements Serializable { return null; } - public static String fromJsonOAF(final Typology oafType, final String oafJson, final String relEntityJson) { + public static String fromJsonOAF( + final Typology oafType, final String oafJson, final String relEntityJson) { try { final ObjectMapper mapper = new ObjectMapper(); mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class); + RelatedItemInfo relatedItemInfo = + mapper.readValue(relEntityJson, RelatedItemInfo.class); switch (oafType) { case dataset: - return mapper.writeValueAsString(summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); + return mapper.writeValueAsString( + summaryFromDataset( + mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); case publication: - return 
mapper.writeValueAsString(summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo)); + return mapper.writeValueAsString( + summaryFromPublication( + mapper.readValue(oafJson, DLIPublication.class), + relatedItemInfo)); case unknown: - return mapper.writeValueAsString(summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); + return mapper.writeValueAsString( + summaryFromUnknown( + mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); } - } catch (Throwable e) { throw new RuntimeException(e); } @@ -181,23 +189,32 @@ public class ScholixSummary implements Serializable { return null; } - - private static ScholixSummary summaryFromDataset(final DLIDataset item, final RelatedItemInfo relatedItemInfo) { + private static ScholixSummary summaryFromDataset( + final DLIDataset item, final RelatedItemInfo relatedItemInfo) { ScholixSummary summary = new ScholixSummary(); summary.setId(item.getId()); if (item.getPid() != null) - summary.setLocalIdentifier(item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList()) - ); + summary.setLocalIdentifier( + item.getPid().stream() + .map( + p -> + new TypedIdentifier( + p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); summary.setTypology(Typology.dataset); if (item.getTitle() != null) - summary.setTitle(item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTitle( + item.getTitle().stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); if (item.getAuthor() != null) { - summary.setAuthor(item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + summary.setAuthor( + item.getAuthor().stream() + .map(Author::getFullname) + .collect(Collectors.toList())); } if (item.getRelevantdate() != null) @@ -205,19 +222,18 @@ public class ScholixSummary implements Serializable { 
item.getRelevantdate().stream() .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) .map(StructuredProperty::getValue) - .collect(Collectors.toList()) - ); + .collect(Collectors.toList())); if (item.getDescription() != null && item.getDescription().size() > 0) summary.setDescription(item.getDescription().get(0).getValue()); if (item.getSubject() != null) { - summary.setSubject(item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList()) - ); + summary.setSubject( + item.getSubject().stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); } - if (item.getPublisher()!= null) + if (item.getPublisher() != null) summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); @@ -225,29 +241,44 @@ public class ScholixSummary implements Serializable { summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); if (item.getDlicollectedfrom() != null) - summary.setDatasources(item.getDlicollectedfrom().stream() - .map( - c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()) - ).collect(Collectors.toList())); + summary.setDatasources( + item.getDlicollectedfrom().stream() + .map( + c -> + new CollectedFromType( + c.getName(), + c.getId(), + c.getCompletionStatus())) + .collect(Collectors.toList())); return summary; } - private static ScholixSummary summaryFromPublication(final DLIPublication item, final RelatedItemInfo relatedItemInfo) { + private static ScholixSummary summaryFromPublication( + final DLIPublication item, final RelatedItemInfo relatedItemInfo) { ScholixSummary summary = new ScholixSummary(); summary.setId(item.getId()); if (item.getPid() != null) - summary.setLocalIdentifier(item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - 
.collect(Collectors.toList()) - ); + summary.setLocalIdentifier( + item.getPid().stream() + .map( + p -> + new TypedIdentifier( + p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); summary.setTypology(Typology.publication); if (item.getTitle() != null) - summary.setTitle(item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTitle( + item.getTitle().stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); if (item.getAuthor() != null) { - summary.setAuthor(item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + summary.setAuthor( + item.getAuthor().stream() + .map(Author::getFullname) + .collect(Collectors.toList())); } if (item.getRelevantdate() != null) @@ -255,55 +286,66 @@ public class ScholixSummary implements Serializable { item.getRelevantdate().stream() .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) .map(StructuredProperty::getValue) - .collect(Collectors.toList()) - ); + .collect(Collectors.toList())); if (item.getDescription() != null && item.getDescription().size() > 0) summary.setDescription(item.getDescription().get(0).getValue()); if (item.getSubject() != null) { - summary.setSubject(item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList()) - ); + summary.setSubject( + item.getSubject().stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); } - if (item.getPublisher()!= null) + if (item.getPublisher() != null) summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); if (item.getDlicollectedfrom() != null) - 
summary.setDatasources(item.getDlicollectedfrom().stream() - .map( - c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()) - ).collect(Collectors.toList())); - + summary.setDatasources( + item.getDlicollectedfrom().stream() + .map( + c -> + new CollectedFromType( + c.getName(), + c.getId(), + c.getCompletionStatus())) + .collect(Collectors.toList())); return summary; } - private static ScholixSummary summaryFromUnknown(final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { + private static ScholixSummary summaryFromUnknown( + final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { ScholixSummary summary = new ScholixSummary(); summary.setId(item.getId()); if (item.getPid() != null) - summary.setLocalIdentifier(item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList()) - ); + summary.setLocalIdentifier( + item.getPid().stream() + .map( + p -> + new TypedIdentifier( + p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); summary.setTypology(Typology.unknown); if (item.getDlicollectedfrom() != null) - summary.setDatasources(item.getDlicollectedfrom().stream() - .map( - c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus()) - ).collect(Collectors.toList())); + summary.setDatasources( + item.getDlicollectedfrom().stream() + .map( + c -> + new CollectedFromType( + c.getName(), + c.getId(), + c.getCompletionStatus())) + .collect(Collectors.toList())); return summary; } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java index fd6c05ce3..a452de444 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java @@ -6,8 +6,7 @@ public class TypedIdentifier implements Serializable { private String id; private String type; - public TypedIdentifier() { - } + public TypedIdentifier() {} public TypedIdentifier(String id, String type) { this.id = id; @@ -29,4 +28,4 @@ public class TypedIdentifier implements Serializable { public void setType(String type) { this.type = type; } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java index 828d8f9b5..79e91799d 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java @@ -9,14 +9,14 @@ import eu.dnetlib.dhp.provision.scholix.ScholixEntityId; import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; import eu.dnetlib.dhp.provision.scholix.ScholixResource; import eu.dnetlib.dhp.utils.DHPUtils; - import java.util.ArrayList; import java.util.Collections; import java.util.List; public class CrossRefParserJSON { - private static List collectedFrom =generateCrossrefCollectedFrom("complete"); + private static List collectedFrom = + generateCrossrefCollectedFrom("complete"); public static ScholixResource parseRecord(final String record) { if (record == null) return null; @@ -24,10 +24,8 @@ public class 
CrossRefParserJSON { JsonElement source = null; if (jElement.getAsJsonObject().has("_source")) { source = jElement.getAsJsonObject().get("_source"); - if (source == null || !source.isJsonObject()) - return null; - } - else if(jElement.getAsJsonObject().has("DOI")){ + if (source == null || !source.isJsonObject()) return null; + } else if (jElement.getAsJsonObject().has("DOI")) { source = jElement; } else { return null; @@ -38,14 +36,19 @@ public class CrossRefParserJSON { if (message.get("DOI") != null) { final String doi = message.get("DOI").getAsString(); - currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); + currentObject.setIdentifier( + Collections.singletonList(new ScholixIdentifier(doi, "doi"))); } - if ((!message.get("created").isJsonNull()) && (message.getAsJsonObject("created").get("date-time") != null)) { - currentObject.setPublicationDate(message.getAsJsonObject("created").get("date-time").getAsString()); + if ((!message.get("created").isJsonNull()) + && (message.getAsJsonObject("created").get("date-time") != null)) { + currentObject.setPublicationDate( + message.getAsJsonObject("created").get("date-time").getAsString()); } - if (message.get("title")!= null && !message.get("title").isJsonNull() && message.get("title").isJsonArray() ) { + if (message.get("title") != null + && !message.get("title").isJsonNull() + && message.get("title").isJsonArray()) { JsonArray array = message.get("title").getAsJsonArray(); currentObject.setTitle(array.get(0).getAsString()); @@ -58,10 +61,14 @@ public class CrossRefParserJSON { String family = ""; String given = ""; - if (currentAuth != null && currentAuth.get("family") != null && !currentAuth.get("family").isJsonNull()) { + if (currentAuth != null + && currentAuth.get("family") != null + && !currentAuth.get("family").isJsonNull()) { family = currentAuth.get("family").getAsString(); } - if (currentAuth != null && currentAuth.get("given") != null && 
!currentAuth.get("given").isJsonNull()) { + if (currentAuth != null + && currentAuth.get("given") != null + && !currentAuth.get("given").isJsonNull()) { given = currentAuth.get("given").getAsString(); } authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); @@ -69,26 +76,34 @@ public class CrossRefParserJSON { currentObject.setCreator(authorList); } if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) { - currentObject.setPublisher(Collections.singletonList(new ScholixEntityId(message.get("publisher").getAsString(), null))); + currentObject.setPublisher( + Collections.singletonList( + new ScholixEntityId(message.get("publisher").getAsString(), null))); } currentObject.setCollectedFrom(collectedFrom); currentObject.setObjectType("publication"); - currentObject.setDnetIdentifier(generateId(message.get("DOI").getAsString(), "doi", "publication")); + currentObject.setDnetIdentifier( + generateId(message.get("DOI").getAsString(), "doi", "publication")); return currentObject; } - private static List generateCrossrefCollectedFrom(final String completionStatus) { - final ScholixEntityId scholixEntityId = new ScholixEntityId("Crossref", - Collections.singletonList(new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); + private static List generateCrossrefCollectedFrom( + final String completionStatus) { + final ScholixEntityId scholixEntityId = + new ScholixEntityId( + "Crossref", + Collections.singletonList( + new ScholixIdentifier( + "dli_________::crossref", "dnet_identifier"))); return Collections.singletonList( - new ScholixCollectedFrom( - scholixEntityId,"resolved", completionStatus)); + new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); } - private static String generateId(final String pid, final String pidType, final String entityType) { + private static String generateId( + final String pid, final String pidType, final String entityType) { String type; - switch 
(entityType){ + switch (entityType) { case "publication": type = "50|"; break; @@ -99,10 +114,11 @@ public class CrossRefParserJSON { type = "70|"; break; default: - throw new IllegalArgumentException("unexpected value "+entityType); + throw new IllegalArgumentException("unexpected value " + entityType); } - return type+ DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + return type + + DHPUtils.md5( + String.format( + "%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); } - - } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java index 3190ee516..e000dcbce 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java @@ -3,6 +3,8 @@ package eu.dnetlib.dhp.provision.update; import com.google.gson.JsonElement; import com.google.gson.JsonParser; import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import java.io.ByteArrayOutputStream; +import java.util.zip.Inflater; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; @@ -10,16 +12,12 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import java.io.ByteArrayOutputStream; -import java.util.zip.Inflater; - public class CrossrefClient { private String host; - private String index ="crossref"; + private String index = "crossref"; private String indexType = "item"; - public CrossrefClient(String host) { this.host = host; } @@ -63,26 +61,33 @@ public class CrossrefClient { decompresser.end(); 
return new String(unzippeddata); } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob,e); + throw new RuntimeException("Wrong record:" + blob, e); } } - - public ScholixResource getResourceByDOI(final String doi) { try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = new HttpGet(String.format("http://%s:9200/%s/%s/%s", host, index,indexType, doi.replaceAll("/","%2F"))); + HttpGet httpGet = + new HttpGet( + String.format( + "http://%s:9200/%s/%s/%s", + host, index, indexType, doi.replaceAll("/", "%2F"))); CloseableHttpResponse response = client.execute(httpGet); String json = IOUtils.toString(response.getEntity().getContent()); if (json.contains("blob")) { JsonParser p = new JsonParser(); final JsonElement root = p.parse(json); - json =decompressBlob(root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); + json = + decompressBlob( + root.getAsJsonObject() + .get("_source") + .getAsJsonObject() + .get("blob") + .getAsString()); } return CrossRefParserJSON.parseRecord(json); } catch (Throwable e) { return null; } - } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java index fd2e37837..39f2dbaa6 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java @@ -5,13 +5,12 @@ import eu.dnetlib.dhp.provision.scholix.*; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import org.apache.commons.lang3.StringUtils; - import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; 
import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; public class Datacite2Scholix { @@ -24,24 +23,38 @@ public class Datacite2Scholix { public List generateScholixFromJson(final String dJson) { List> relIds = getRelatedIendtifiers(dJson); - relIds = relIds!= null ? relIds.stream().filter(m-> - m.containsKey("relatedIdentifierType") && m.containsKey("relationType" ) && m.containsKey( "relatedIdentifier") - ).collect(Collectors.toList()) : null; - if(relIds== null || relIds.size() ==0 ) - return null; + relIds = + relIds != null + ? relIds.stream() + .filter( + m -> + m.containsKey("relatedIdentifierType") + && m.containsKey("relationType") + && m.containsKey("relatedIdentifier")) + .collect(Collectors.toList()) + : null; + if (relIds == null || relIds.size() == 0) return null; final String updated = JsonPath.read(dJson, rootPath + ".updated"); ScholixResource resource = generateDataciteScholixResource(dJson); - return relIds.stream().flatMap(s-> { - try { - final List result = generateScholix(resource, ""+s.get("relatedIdentifier"), s.get("relatedIdentifierType"), s.get("relationType"), updated); - return result.stream(); - } catch (Throwable e) - { - return new ArrayList().stream(); - } - }).collect(Collectors.toList()); + return relIds.stream() + .flatMap( + s -> { + try { + final List result = + generateScholix( + resource, + "" + s.get("relatedIdentifier"), + s.get("relatedIdentifierType"), + s.get("relationType"), + updated); + return result.stream(); + } catch (Throwable e) { + return new ArrayList().stream(); + } + }) + .collect(Collectors.toList()); } public String getRootPath() { @@ -52,13 +65,20 @@ public class Datacite2Scholix { this.rootPath = rootPath; } - private List generateScholix(ScholixResource source, final String pid, final String pidtype, final String relType, final String updated) { + private List generateScholix( + ScholixResource source, + final String pid, + final String pidtype, + final String relType, + final 
String updated) { if ("doi".equalsIgnoreCase(pidtype)) { ScholixResource target = new ScholixResource(); target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); + final ScholixRelationship rel = + new ScholixRelationship( + relInfo.getOriginal(), "datacite", relInfo.getInverse()); final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); final Scholix s = new Scholix(); s.setSource(source); @@ -76,7 +96,9 @@ public class Datacite2Scholix { target.setObjectType("unknown"); target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); + final ScholixRelationship rel = + new ScholixRelationship( + relInfo.getOriginal(), "datacite", relInfo.getInverse()); final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); final Scholix s = new Scholix(); s.setSource(source); @@ -92,7 +114,9 @@ public class Datacite2Scholix { s2.setTarget(source); s2.setLinkprovider(Collections.singletonList(provider)); s2.setPublisher(source.getPublisher()); - s2.setRelationship(new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); + s2.setRelationship( + new ScholixRelationship( + relInfo.getInverse(), "datacite", relInfo.getOriginal())); s2.setPublicationDate(updated); s2.generateIdentifier(); result.add(s2); @@ -112,45 +136,51 @@ public class Datacite2Scholix { if (StringUtils.isNotBlank(publisher)) resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); final String date = getDate(dJson); - if (StringUtils.isNotBlank(date)) - resource.setPublicationDate(date); + if 
(StringUtils.isNotBlank(date)) resource.setPublicationDate(date); final String title = getTitle(dJson); - if(StringUtils.isNotBlank(title)) - resource.setTitle(title); + if (StringUtils.isNotBlank(title)) resource.setTitle(title); resource.setCreator(getCreators(dJson)); return resource; } private List getCreators(final String json) { final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); - if (creatorName!= null && creatorName.size() >0) { - return creatorName.stream().map(s-> new ScholixEntityId(s, null)).collect(Collectors.toList()); + if (creatorName != null && creatorName.size() > 0) { + return creatorName.stream() + .map(s -> new ScholixEntityId(s, null)) + .collect(Collectors.toList()); } return null; } - private String getTitle(final String json){ + private String getTitle(final String json) { final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); - return titles!= null && titles.size()>0?titles.get(0): null; + return titles != null && titles.size() > 0 ? 
titles.get(0) : null; } private String getDate(final String json) { - final List> dates = JsonPath.read(json, rootPath + ".dates"); - if(dates!= null && dates.size()>0){ + final List> dates = JsonPath.read(json, rootPath + ".dates"); + if (dates != null && dates.size() > 0) { - List> issued = dates.stream().filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))).collect(Collectors.toList()); - if (issued.size()>0) - return issued.get(0).get("date"); + List> issued = + dates.stream() + .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) + .collect(Collectors.toList()); + if (issued.size() > 0) return issued.get(0).get("date"); } return null; } - private List generateDataciteCollectedFrom(final String completionStatus) { - final ScholixEntityId scholixEntityId = new ScholixEntityId("Datasets in Datacite", - Collections.singletonList(new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); + private List generateDataciteCollectedFrom( + final String completionStatus) { + final ScholixEntityId scholixEntityId = + new ScholixEntityId( + "Datasets in Datacite", + Collections.singletonList( + new ScholixIdentifier( + "dli_________::datacite", "dnet_identifier"))); return Collections.singletonList( - new ScholixCollectedFrom( - scholixEntityId,"collected", completionStatus)); + new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); } private String getType(final String json) { @@ -171,9 +201,10 @@ public class Datacite2Scholix { return res; } - public static String generateId(final String pid, final String pidType, final String entityType) { + public static String generateId( + final String pid, final String pidType, final String entityType) { String type; - switch (entityType){ + switch (entityType) { case "publication": type = "50|"; break; @@ -184,8 +215,11 @@ public class Datacite2Scholix { type = "70|"; break; default: - throw new IllegalArgumentException("unexpected value "+entityType); + throw new 
IllegalArgumentException("unexpected value " + entityType); } - return type+ DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + return type + + DHPUtils.md5( + String.format( + "%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java index e1d25bf2e..635dc0c54 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java @@ -1,19 +1,17 @@ package eu.dnetlib.dhp.provision.update; import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import java.io.IOException; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import java.io.IOException; - public class DataciteClient { - private String host; - private String index ="datacite"; + private String index = "datacite"; private String indexType = "dump"; private Datacite2Scholix d2s; @@ -25,15 +23,13 @@ public class DataciteClient { } public Iterable getDatasetsFromTs(final Long timestamp) { - return ()-> { + return () -> { try { return new DataciteClientIterator(host, index, timestamp); } catch (IOException e) { throw new RuntimeException(e); } }; - - } public String getHost() { @@ -62,14 +58,16 @@ public class DataciteClient { public ScholixResource getDatasetByDOI(final String doi) { try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = new HttpGet(String.format("http://%s:9200/%s/%s/%s", host, 
index,indexType, doi.replaceAll("/","%2F"))); + HttpGet httpGet = + new HttpGet( + String.format( + "http://%s:9200/%s/%s/%s", + host, index, indexType, doi.replaceAll("/", "%2F"))); CloseableHttpResponse response = client.execute(httpGet); - final String json =IOUtils.toString(response.getEntity().getContent()); + final String json = IOUtils.toString(response.getEntity().getContent()); return d2s.generateDataciteScholixResource(json); } catch (Throwable e) { return null; } } - - } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java index 61c1aa39f..94937d7d3 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java @@ -1,6 +1,11 @@ package eu.dnetlib.dhp.provision.update; + import com.fasterxml.jackson.databind.ObjectMapper; import com.jayway.jsonpath.JsonPath; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; import net.minidev.json.JSONArray; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; @@ -9,15 +14,10 @@ import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.stream.Collectors; - public class DataciteClientIterator implements Iterator { - final static String blobPath = "$.hits.hits[*]._source"; - final static String scrollIdPath = "$._scroll_id"; + static final String blobPath = "$.hits.hits[*]._source"; + static final String scrollIdPath = 
"$._scroll_id"; String scrollId; @@ -27,25 +27,29 @@ public class DataciteClientIterator implements Iterator { final String esIndex; final ObjectMapper mapper = new ObjectMapper(); - public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) throws IOException { + public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) + throws IOException { this.esHost = esHost; this.esIndex = esIndex; // THIS FIX IS NECESSARY to avoid different timezone - timestamp -= (60 *60 *2); - final String body =getResponse(String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), String.format("{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); - scrollId= getJPathString(scrollIdPath, body); + timestamp -= (60 * 60 * 2); + final String body = + getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + String.format( + "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", + timestamp)); + scrollId = getJPathString(scrollIdPath, body); buffer = getBlobs(body); - } - - public String getResponse(final String url,final String json ) { + public String getResponse(final String url, final String json) { CloseableHttpClient client = HttpClients.createDefault(); try { HttpPost httpPost = new HttpPost(url); - if (json!= null) { + if (json != null) { StringEntity entity = new StringEntity(json); httpPost.setEntity(entity); httpPost.setHeader("Accept", "application/json"); @@ -55,22 +59,20 @@ public class DataciteClientIterator implements Iterator { return IOUtils.toString(response.getEntity().getContent()); } catch (Throwable e) { - throw new RuntimeException("Error on executing request ",e); + throw new RuntimeException("Error on executing request ", e); } finally { try { client.close(); } catch (IOException e) { - throw new RuntimeException("Unable to close client ",e); + throw new RuntimeException("Unable to close client ", e); } } - } - private 
String getJPathString(final String jsonPath, final String json) { + private String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; + if (o instanceof String) return (String) o; return null; } catch (Exception e) { return ""; @@ -79,37 +81,36 @@ public class DataciteClientIterator implements Iterator { private List getBlobs(final String body) { JSONArray array = JsonPath.read(body, blobPath); - return array.stream().map( - o -> { - try { - return mapper.writeValueAsString(o); - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - ).collect(Collectors.toList()); + return array.stream() + .map( + o -> { + try { + return mapper.writeValueAsString(o); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); } - @Override public boolean hasNext() { - return (buffer!= null && !buffer.isEmpty()); - + return (buffer != null && !buffer.isEmpty()); } @Override public String next() { final String nextItem = buffer.remove(0); if (buffer.isEmpty()) { - final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); - final String body =getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + final String json_param = + String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); + final String body = + getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); try { buffer = getBlobs(body); } catch (Throwable e) { System.out.println(body); - } - } return nextItem; } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java index ea659dbb1..c8f60827c 100644 --- 
a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java @@ -4,6 +4,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.provision.scholix.Scholix; import eu.dnetlib.scholexplorer.relation.RelationMapper; +import java.net.URI; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -12,14 +14,14 @@ import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; -import java.net.URI; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - public class RetrieveUpdateFromDatacite { - public static void main(String[] args) throws Exception{ - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(RetrieveUpdateFromDatacite.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + RetrieveUpdateFromDatacite.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); parser.parseArgument(args); final String hdfsuri = parser.get("namenode"); Path hdfswritepath = new Path(parser.get("targetPath")); @@ -38,26 +40,28 @@ public class RetrieveUpdateFromDatacite { FileSystem.get(URI.create(hdfsuri), conf); final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); final ObjectMapper mapper = new ObjectMapper(); - try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(hdfswritepath), 
SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { final Text value = new Text(); final IntWritable key = new IntWritable(); int i = 0; - for(String dataset: new DataciteClient(host).getDatasetsFromTs(timestamp)) { + for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { i++; List scholix = d2s.generateScholixFromJson(dataset); - if (scholix!= null) - for(Scholix s: scholix) { + if (scholix != null) + for (Scholix s : scholix) { key.set(i); value.set(mapper.writeValueAsString(s)); writer.append(key, value); if (i % 10000 == 0) { - System.out.println("wrote "+i); + System.out.println("wrote " + i); } } } - } } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java index 35020ecdf..d6f6dfeac 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java @@ -7,7 +7,10 @@ import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; import eu.dnetlib.dhp.provision.scholix.ScholixResource; import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.scholexplorer.relation.RelationMapper; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import 
org.apache.hadoop.io.IntWritable; @@ -19,116 +22,168 @@ import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import scala.Tuple2; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - public class SparkResolveScholixTarget { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkResolveScholixTarget.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + SparkResolveScholixTarget.class.getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); parser.parseArgument(args); final SparkConf conf = new SparkConf(); final String master = parser.get("master"); final String sourcePath = parser.get("sourcePath"); - final String workingDirPath= parser.get("workingDirPath"); - final String indexHost= parser.get("indexHost"); - try (SparkSession spark = getSession(conf, master)){ + final String workingDirPath = parser.get("workingDirPath"); + final String indexHost = parser.get("indexHost"); + try (SparkSession spark = getSession(conf, master)) { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + spark.createDataset( + sc.sequenceFile(sourcePath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map( + s -> + new ObjectMapper() + .readValue(s.toString(), Scholix.class)) + .rdd(), + Encoders.bean(Scholix.class)) + .write() + .save(workingDirPath + "/stepA"); - spark.createDataset(sc.sequenceFile(sourcePath, IntWritable.class,Text.class) - .map(Tuple2::_2) - .map(s-> new ObjectMapper().readValue(s.toString(), Scholix.class)).rdd(), Encoders.bean(Scholix.class)) - .write().save(workingDirPath+"/stepA"); + Dataset s1 = + 
spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); + s1.where(s1.col("target.dnetIdentifier").isNull()) + .select(s1.col("target.identifier")) + .distinct() + .map( + (MapFunction) + f -> { + final String pid = ((Row) f.getList(0).get(0)).getString(0); + ScholixResource publication = + new CrossrefClient(indexHost).getResourceByDOI(pid); + if (publication != null) { + return publication; + } + ScholixResource dataset = + new DataciteClient(indexHost).getDatasetByDOI(pid); + if (dataset != null) { + return dataset; + } + ScholixResource r = new ScholixResource(); + r.setIdentifier( + Collections.singletonList( + new ScholixIdentifier(pid, "doi"))); + r.setObjectType("unknown"); + r.setDnetIdentifier( + "70|" + + DHPUtils.md5( + String.format( + "%s::doi", + pid.toLowerCase().trim()))); + return r; + }, + Encoders.bean(ScholixResource.class)) + .write() + .mode(SaveMode.Overwrite) + .save(workingDirPath + "/stepB"); - Dataset s1 = spark.read().load(workingDirPath+"/stepA").as(Encoders.bean(Scholix.class)); + Dataset s2 = + spark.read() + .load(workingDirPath + "/stepB") + .as(Encoders.bean(ScholixResource.class)); - s1.where(s1.col("target.dnetIdentifier").isNull()).select(s1.col("target.identifier")).distinct() - .map((MapFunction) f-> { - final String pid = ((Row) f.getList(0).get(0)).getString(0); - ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid); - if (publication != null) { - return publication; - } - ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid); - if (dataset!= null) { - return dataset; - } - ScholixResource r = new ScholixResource(); - r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); - r.setObjectType("unknown"); - r.setDnetIdentifier("70|"+DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); + s1.joinWith( + s2, + s1.col("target.identifier.identifier") + .equalTo(s2.col("identifier.identifier")), + "left") + .flatMap( + 
(FlatMapFunction, Scholix>) + f -> { + final List res = new ArrayList<>(); + final Scholix s = f._1(); + final ScholixResource target = f._2(); + if (StringUtils.isNotBlank(s.getIdentifier())) res.add(s); + else if (target == null) { + ScholixResource currentTarget = s.getTarget(); + currentTarget.setObjectType("unknown"); + currentTarget.setDnetIdentifier( + Datacite2Scholix.generateId( + currentTarget + .getIdentifier() + .get(0) + .getIdentifier(), + currentTarget + .getIdentifier() + .get(0) + .getSchema(), + currentTarget.getObjectType())); - return r; - }, Encoders.bean(ScholixResource.class)).write().mode(SaveMode.Overwrite).save(workingDirPath+"/stepB"); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse.setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); + } else { + target.setIdentifier( + target.getIdentifier().stream() + .map( + d -> + new ScholixIdentifier( + d.getIdentifier() + .toLowerCase(), + d.getSchema() + .toLowerCase())) + .collect(Collectors.toList())); + s.setTarget(target); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse.setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); + } - Dataset s2 = 
spark.read().load(workingDirPath+"/stepB").as(Encoders.bean(ScholixResource.class)); - - - s1.joinWith(s2, s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), "left") - - .flatMap((FlatMapFunction, Scholix>) f -> - { - - final List res = new ArrayList<>(); - final Scholix s = f._1(); - final ScholixResource target = f._2(); - if (StringUtils.isNotBlank(s.getIdentifier())) - res.add(s); - else if (target == null) { - ScholixResource currentTarget = s.getTarget(); - currentTarget.setObjectType("unknown"); - currentTarget.setDnetIdentifier(Datacite2Scholix.generateId(currentTarget.getIdentifier().get(0).getIdentifier(),currentTarget.getIdentifier().get(0).getSchema(), currentTarget.getObjectType())); - - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship(new ScholixRelationship(s.getRelationship().getInverse(), s.getRelationship().getSchema(), s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - - } else - { - target.setIdentifier(target.getIdentifier().stream().map(d -> new ScholixIdentifier(d.getIdentifier().toLowerCase(), d.getSchema().toLowerCase())).collect(Collectors.toList())); - s.setTarget(target); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship(new ScholixRelationship(s.getRelationship().getInverse(), s.getRelationship().getSchema(), s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - } - - return res.iterator(); - }, 
Encoders.bean(Scholix.class)).javaRDD().map(s -> new ObjectMapper().writeValueAsString(s)).saveAsTextFile(workingDirPath+"/resolved_json"); + return res.iterator(); + }, + Encoders.bean(Scholix.class)) + .javaRDD() + .map(s -> new ObjectMapper().writeValueAsString(s)) + .saveAsTextFile(workingDirPath + "/resolved_json"); } } private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession - .builder() + return SparkSession.builder() .config(conf) .appName(SparkResolveScholixTarget.class.getSimpleName()) .master(master) .getOrCreate(); } - } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java index cc4b0047a..cd86702be 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java @@ -5,13 +5,11 @@ import eu.dnetlib.dhp.provision.scholix.Scholix; import eu.dnetlib.dhp.provision.scholix.ScholixResource; import eu.dnetlib.dhp.provision.update.*; import eu.dnetlib.scholexplorer.relation.RelationMapper; +import java.util.List; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; -import java.util.List; - - public class DataciteClientTest { @Test public void dataciteSCholixTest() throws Exception { @@ -23,18 +21,16 @@ public class DataciteClientTest { System.out.println(new ObjectMapper().writeValueAsString(s)); } - -// public void testS() throws Exception { -// RetrieveUpdateFromDatacite.main(new String[]{ -// "-n", "file:///data/new_s2.txt", -// "-t", "/data/new_s2.txt", -// "-ts", "1586974078", -// "-ih", "ip-90-147-167-25.ct1.garrservices.it", -// "-in", "datacite", -// }); -// -// } - + // public void testS() throws 
Exception { + // RetrieveUpdateFromDatacite.main(new String[]{ + // "-n", "file:///data/new_s2.txt", + // "-t", "/data/new_s2.txt", + // "-ts", "1586974078", + // "-ih", "ip-90-147-167-25.ct1.garrservices.it", + // "-in", "datacite", + // }); + // + // } public void testResolveDataset() throws Exception { DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); @@ -42,9 +38,9 @@ public class DataciteClientTest { Assertions.assertNotNull(datasetByDOI); System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); - CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); + ScholixResource crossrefByDOI = + cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); Assertions.assertNotNull(crossrefByDOI); System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java index b5142447d..b85dd1709 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java @@ -21,8 +21,8 @@ public class ExtractInfoTest { @Test public void testScholix() throws Exception { final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); - final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); + final String jsonRelation = + IOUtils.toString(getClass().getResourceAsStream("relation.json")); Scholix.generateScholixWithSource(jsonSummary, jsonRelation); } - } diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index 8162927b8..27f99433c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -1,11 +1,16 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.Tuple2; import eu.dnetlib.dhp.schema.common.ModelSupport; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -16,42 +21,29 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. - * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of + * linked objects. 
The operation considers all the entity types (publication, dataset, software, + * ORP, project, datasource, organization, and all the possible relationships (similarity links + * produced by the Dedup process are excluded). * - * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again - * by E, finally grouped by E.id; + *

The operation is implemented by sequentially joining one entity type at time (E) with the + * relationships (R), and again by E, finally grouped by E.id; * - * The workflow is organized in different parts aimed to to reduce the complexity of the operation - * 1) PrepareRelationsJob: - * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity - * can be linked at most to 100 other objects + *

The workflow is organized in different parts aimed to to reduce the complexity of the + * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects * - * 2) JoinRelationEntityByTargetJob: - * (phase 1): prepare tuples [relation - target entity] (R - T): - * for each entity type E_i - * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information - * join (R.target = T_i.id) - * save the tuples (R_i, T_i) - * (phase 2): - * create the union of all the entity types E, hash by id - * read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S - * save the tuples (S, R, T) + *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - + * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting + * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): + * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) * - * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity + *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - + * T ], mapping the result as JoinedEntity * - * 4) XmlConverterJob: - * convert the JoinedEntities as XML records + *

4) XmlConverterJob: convert the JoinedEntities as XML records */ public class AdjacencyListBuilderJob { @@ -61,16 +53,17 @@ public class AdjacencyListBuilderJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - AdjacencyListBuilderJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + AdjacencyListBuilderJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputPath = parser.get("inputPath"); @@ -83,33 +76,41 @@ public class AdjacencyListBuilderJob { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); createAdjacencyLists(spark, inputPath, outputPath); }); } - private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) { + private static void createAdjacencyLists( + SparkSession spark, String inputPath, String outputPath) { log.info("Reading joined entities from: {}", inputPath); spark.read() .load(inputPath) .as(Encoders.bean(EntityRelEntity.class)) - .groupByKey((MapFunction) value -> value.getEntity().getId(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) (key, values) -> { - JoinedEntity j = new 
JoinedEntity(); - List links = new ArrayList<>(); - while (values.hasNext() && links.size() < MAX_LINKS) { - EntityRelEntity curr = values.next(); - if (j.getEntity() == null) { - j.setEntity(curr.getEntity()); - } - links.add(new Tuple2(curr.getRelation(), curr.getTarget())); - } - j.setLinks(links); - return j; - }, Encoders.bean(JoinedEntity.class)) + .groupByKey( + (MapFunction) value -> value.getEntity().getId(), + Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) + (key, values) -> { + JoinedEntity j = new JoinedEntity(); + List links = new ArrayList<>(); + while (values.hasNext() && links.size() < MAX_LINKS) { + EntityRelEntity curr = values.next(); + if (j.getEntity() == null) { + j.setEntity(curr.getEntity()); + } + links.add(new Tuple2(curr.getRelation(), curr.getTarget())); + } + j.setLinks(links); + return j; + }, + Encoders.bean(JoinedEntity.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); @@ -118,5 +119,4 @@ public class AdjacencyListBuilderJob { private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index d0035fa78..6b1dd52ce 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -1,5 +1,8 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.common.HdfsSupport; @@ -9,6 +12,7 @@ import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.OafEntity; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -21,60 +25,50 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; - /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. - * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of + * linked objects. The operation considers all the entity types (publication, dataset, software, + * ORP, project, datasource, organization, and all the possible relationships (similarity links + * produced by the Dedup process are excluded). * - * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again - * by E, finally grouped by E.id; + *

The operation is implemented by sequentially joining one entity type at time (E) with the + * relationships (R), and again by E, finally grouped by E.id; * - * The workflow is organized in different parts aimed to to reduce the complexity of the operation - * 1) PrepareRelationsJob: - * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity - * can be linked at most to 100 other objects + *

The workflow is organized in different parts aimed to to reduce the complexity of the + * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects * - * 2) JoinRelationEntityByTargetJob: - * (phase 1): prepare tuples [relation - target entity] (R - T): - * for each entity type E_i - * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information - * join (R.target = T_i.id) - * save the tuples (R_i, T_i) - * (phase 2): - * create the union of all the entity types E, hash by id - * read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S - * save the tuples (S, R, T) + *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - + * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting + * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): + * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) * - * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity + *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - + * T ], mapping the result as JoinedEntity * - * 4) XmlConverterJob: - * convert the JoinedEntities as XML records + *

4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase1 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); + private static final Logger log = + LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils.toString( - PrepareRelationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + String jsonConfiguration = + IOUtils.toString( + PrepareRelationsJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputRelationsPath = parser.get("inputRelationsPath"); @@ -89,70 +83,100 @@ public class CreateRelatedEntitiesJob_phase1 { String graphTableClassName = parser.get("graphTableClassName"); log.info("graphTableClassName: {}", graphTableClassName); - Class entityClazz = (Class) Class.forName(graphTableClassName); + Class entityClazz = + (Class) Class.forName(graphTableClassName); SparkConf conf = new SparkConf(); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - 
joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); + joinRelationEntity( + spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); }); } - private static void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class entityClazz, String outputPath) { + private static void joinRelationEntity( + SparkSession spark, + String inputRelationsPath, + String inputEntityPath, + Class entityClazz, + String outputPath) { - Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) - .filter((FilterFunction) value -> value.getDataInfo().getDeletedbyinference() == false) - .map((MapFunction>) r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) - .cache(); + Dataset> relsByTarget = + readPathRelation(spark, inputRelationsPath) + .filter( + (FilterFunction) + value -> + value.getDataInfo().getDeletedbyinference() + == false) + .map( + (MapFunction>) + r -> new Tuple2<>(r.getTarget(), r), + Encoders.tuple( + Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + .cache(); - Dataset> entities = readPathEntity(spark, inputEntityPath, entityClazz) - .map((MapFunction) value -> asRelatedEntity(value, entityClazz), Encoders.bean(RelatedEntity.class)) - .map((MapFunction>) e -> new Tuple2<>(e.getId(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) - .cache(); + Dataset> entities = + readPathEntity(spark, inputEntityPath, entityClazz) + .map( + (MapFunction) + value -> asRelatedEntity(value, entityClazz), + Encoders.bean(RelatedEntity.class)) + .map( + (MapFunction>) + e -> new Tuple2<>(e.getId(), e), + Encoders.tuple( + Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) + .cache(); relsByTarget .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") - .map((MapFunction, Tuple2>, EntityRelEntity>) - t -> new EntityRelEntity(t._1()._2(), t._2()._2()), + .map( + 
(MapFunction< + Tuple2< + Tuple2, + Tuple2>, + EntityRelEntity>) + t -> new EntityRelEntity(t._1()._2(), t._2()._2()), Encoders.bean(EntityRelEntity.class)) .write() .mode(SaveMode.Overwrite) .parquet(outputPath + "/" + EntityType.fromClass(entityClazz)); } - private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() + return spark.read() .textFile(inputEntityPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)); + .map( + (MapFunction) + value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)); } /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file, + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline + * delimited json text file, * * @param spark * @param relationPath * @return the Dataset containing all the relationships */ - private static Dataset readPathRelation(SparkSession spark, final String relationPath) { + private static Dataset readPathRelation( + SparkSession spark, final String relationPath) { log.info("Reading relations from: {}", relationPath); - return spark.read() - .load(relationPath) - .as(Encoders.bean(SortableRelation.class)); + return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); } private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 
0a9235cc9..20786582f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -1,5 +1,7 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -8,6 +10,8 @@ import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; import eu.dnetlib.dhp.oa.provision.model.TypedRow; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import java.util.List; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -23,60 +27,50 @@ import scala.Tuple2; import scala.collection.JavaConverters; import scala.collection.Seq; -import java.util.List; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. - * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of + * linked objects. The operation considers all the entity types (publication, dataset, software, + * ORP, project, datasource, organization, and all the possible relationships (similarity links + * produced by the Dedup process are excluded). 
* - * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again - * by E, finally grouped by E.id; + *

The operation is implemented by sequentially joining one entity type at time (E) with the + * relationships (R), and again by E, finally grouped by E.id; * - * The workflow is organized in different parts aimed to to reduce the complexity of the operation - * 1) PrepareRelationsJob: - * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity - * can be linked at most to 100 other objects + *

The workflow is organized in different parts aimed to reduce the complexity of the + * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects * - * 2) JoinRelationEntityByTargetJob: - * (phase 1): prepare tuples [relation - target entity] (R - T): - * for each entity type E_i - * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information - * join (R.target = T_i.id) - * save the tuples (R_i, T_i) - * (phase 2): - * create the union of all the entity types E, hash by id - * read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S - * save the tuples (S, R, T) + *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - + * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting + * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): + * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) * - * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity + *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - + * T ], mapping the result as JoinedEntity * - * 4) XmlConverterJob: - * convert the JoinedEntities as XML records + *

4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase2 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); + private static final Logger log = + LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils.toString( - PrepareRelationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); + String jsonConfiguration = + IOUtils.toString( + PrepareRelationsJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); @@ -95,45 +89,88 @@ public class CreateRelatedEntitiesJob_phase2 { conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - joinAllEntities(spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); + joinAllEntities( + spark, + inputRelatedEntitiesPath, + inputGraphRootPath, + outputPath, + numPartitions); }); } - private static void joinAllEntities(SparkSession spark, String inputRelatedEntitiesPath, 
String inputGraphRootPath, String outputPath, int numPartitions) { + private static void joinAllEntities( + SparkSession spark, + String inputRelatedEntitiesPath, + String inputGraphRootPath, + String outputPath, + int numPartitions) { - Dataset> entities = readAllEntities(spark, inputGraphRootPath, numPartitions); - Dataset> relsBySource = readRelatedEntities(spark, inputRelatedEntitiesPath); + Dataset> entities = + readAllEntities(spark, inputGraphRootPath, numPartitions); + Dataset> relsBySource = + readRelatedEntities(spark, inputRelatedEntitiesPath); - entities - .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") - .map((MapFunction, Tuple2>, EntityRelEntity>) value -> { - EntityRelEntity re = new EntityRelEntity(); - re.setEntity(value._1()._2()); - Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); - if (related.isPresent()) { - re.setRelation(related.get().getRelation()); - re.setTarget(related.get().getTarget()); - } - return re; - }, Encoders.bean(EntityRelEntity.class)) + entities.joinWith( + relsBySource, + entities.col("_1").equalTo(relsBySource.col("_1")), + "left_outer") + .map( + (MapFunction< + Tuple2< + Tuple2, + Tuple2>, + EntityRelEntity>) + value -> { + EntityRelEntity re = new EntityRelEntity(); + re.setEntity(value._1()._2()); + Optional related = + Optional.ofNullable(value._2()).map(Tuple2::_2); + if (related.isPresent()) { + re.setRelation(related.get().getRelation()); + re.setTarget(related.get().getTarget()); + } + return re; + }, + Encoders.bean(EntityRelEntity.class)) .repartition(numPartitions) - .filter((FilterFunction) value -> value.getEntity() != null && StringUtils.isNotBlank(value.getEntity().getId())) + .filter( + (FilterFunction) + value -> + value.getEntity() != null + && StringUtils.isNotBlank( + value.getEntity().getId())) .write() .mode(SaveMode.Overwrite) .parquet(outputPath); } - private static Dataset> readAllEntities(SparkSession spark, String inputGraphPath, int 
numPartitions) { - Dataset publication = readPathEntity(spark, inputGraphPath + "/publication", Publication.class); - Dataset dataset = readPathEntity(spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - Dataset other = readPathEntity(spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); - Dataset software = readPathEntity(spark, inputGraphPath + "/software", Software.class); - Dataset datasource = readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); - Dataset organization = readPathEntity(spark, inputGraphPath + "/organization", Organization.class); - Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); + private static Dataset> readAllEntities( + SparkSession spark, String inputGraphPath, int numPartitions) { + Dataset publication = + readPathEntity(spark, inputGraphPath + "/publication", Publication.class); + Dataset dataset = + readPathEntity( + spark, + inputGraphPath + "/dataset", + eu.dnetlib.dhp.schema.oaf.Dataset.class); + Dataset other = + readPathEntity( + spark, + inputGraphPath + "/otherresearchproduct", + OtherResearchProduct.class); + Dataset software = + readPathEntity(spark, inputGraphPath + "/software", Software.class); + Dataset datasource = + readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); + Dataset organization = + readPathEntity(spark, inputGraphPath + "/organization", Organization.class); + Dataset project = + readPathEntity(spark, inputGraphPath + "/project", Project.class); return publication .union(dataset) @@ -142,39 +179,55 @@ public class CreateRelatedEntitiesJob_phase2 { .union(datasource) .union(organization) .union(project) - .map((MapFunction>) + .map( + (MapFunction>) value -> new Tuple2<>(value.getId(), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) .repartition(numPartitions); } - private static Dataset> readRelatedEntities(SparkSession spark, String inputRelatedEntitiesPath) { 
+ private static Dataset> readRelatedEntities( + SparkSession spark, String inputRelatedEntitiesPath) { log.info("Reading related entities from: {}", inputRelatedEntitiesPath); - final List paths = HdfsSupport.listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); + final List paths = + HdfsSupport.listFiles( + inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); log.info("Found paths: {}", String.join(",", paths)); return spark.read() .load(toSeq(paths)) .as(Encoders.bean(EntityRelEntity.class)) - .map((MapFunction>) + .map( + (MapFunction>) value -> new Tuple2<>(value.getRelation().getSource(), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); } - private static Dataset readPathEntity(SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() + return spark.read() .textFile(inputEntityPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)) - .map((MapFunction) value -> getTypedRow(StringUtils.substringAfterLast(inputEntityPath, "/"), value), Encoders.bean(TypedRow.class)); + .map( + (MapFunction) + value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)) + .map( + (MapFunction) + value -> + getTypedRow( + StringUtils.substringAfterLast( + inputEntityPath, "/"), + value), + Encoders.bean(TypedRow.class)); } - private static TypedRow getTypedRow(String type, OafEntity entity) throws JsonProcessingException { + private static TypedRow getTypedRow(String type, OafEntity entity) + throws JsonProcessingException { TypedRow t = new TypedRow(); t.setType(type); t.setDeleted(entity.getDataInfo().getDeletedbyinference()); @@ -190,5 +243,4 @@ public class CreateRelatedEntitiesJob_phase2 { private static Seq toSeq(List list) { 
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 95d2263b5..a6c261ec7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,5 +1,7 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; @@ -7,8 +9,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; +import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -22,40 +24,29 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. - * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of + * linked objects. 
The operation considers all the entity types (publication, dataset, software, + * ORP, project, datasource, organization, and all the possible relationships (similarity links + * produced by the Dedup process are excluded). * - * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again - * by E, finally grouped by E.id; + *

The operation is implemented by sequentially joining one entity type at time (E) with the + * relationships (R), and again by E, finally grouped by E.id; * - * The workflow is organized in different parts aimed to to reduce the complexity of the operation - * 1) PrepareRelationsJob: - * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity - * can be linked at most to 100 other objects + *

The workflow is organized in different parts aimed to reduce the complexity of the + * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects * - * 2) JoinRelationEntityByTargetJob: - * (phase 1): prepare tuples [relation - target entity] (R - T): - * for each entity type E_i - * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information - * join (R.target = T_i.id) - * save the tuples (R_i, T_i) - * (phase 2): - * create the union of all the entity types E, hash by id - * read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S - * save the tuples (S, R, T) + *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - + * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting + * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): + * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) * - * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity + *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - + * T ], mapping the result as JoinedEntity * - * 4) XmlConverterJob: - * convert the JoinedEntities as XML records + *

4) XmlConverterJob: convert the JoinedEntities as XML records */ public class PrepareRelationsJob { @@ -66,16 +57,17 @@ public class PrepareRelationsJob { public static final int MAX_RELS = 100; public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils.toString( - PrepareRelationsJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + String jsonConfiguration = + IOUtils.toString( + PrepareRelationsJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputRelationsPath = parser.get("inputRelationsPath"); @@ -89,18 +81,28 @@ public class PrepareRelationsJob { SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions); }); } - private static void prepareRelationsFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + private static void prepareRelationsFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { readPathRelation(spark, inputRelationsPath) - .filter((FilterFunction) value -> value.getDataInfo().getDeletedbyinference() == false) - .groupByKey((MapFunction) value -> value.getSource(), Encoders.STRING()) - .flatMapGroups((FlatMapGroupsFunction) (key, values) -> 
Iterators.limit(values, MAX_RELS), Encoders.bean(SortableRelation.class)) + .filter( + (FilterFunction) + value -> value.getDataInfo().getDeletedbyinference() == false) + .groupByKey( + (MapFunction) value -> value.getSource(), + Encoders.STRING()) + .flatMapGroups( + (FlatMapGroupsFunction) + (key, values) -> Iterators.limit(values, MAX_RELS), + Encoders.bean(SortableRelation.class)) .repartition(numPartitions) .write() .mode(SaveMode.Overwrite) @@ -108,30 +110,43 @@ public class PrepareRelationsJob { } /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file, + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline + * delimited json text file, * * @param spark * @param inputPath * @return the Dataset containing all the relationships */ - private static Dataset readPathRelation(SparkSession spark, final String inputPath) { + private static Dataset readPathRelation( + SparkSession spark, final String inputPath) { return spark.read() .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), Encoders.bean(SortableRelation.class)); + .map( + (MapFunction) + value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), + Encoders.bean(SortableRelation.class)); } - //TODO work in progress - private static void prepareRelationsRDDFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { - JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath) - .repartition(numPartitions); + // TODO work in progress + private static void prepareRelationsRDDFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + JavaRDD rels = + readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); - RDD d = rels - .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) //only consider those that are not virtually deleted - 
.mapToPair((PairFunction) rel -> new Tuple2<>(rel, rel)) - .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) - .rdd(); + RDD d = + rels.filter( + rel -> + !rel.getDataInfo() + .getDeletedbyinference()) // only consider those + // that are not virtually + // deleted + .mapToPair( + (PairFunction) + rel -> new Tuple2<>(rel, rel)) + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .rdd(); spark.createDataset(d, Encoders.bean(SortableRelation.class)) .write() @@ -139,14 +154,13 @@ public class PrepareRelationsJob { .parquet(outputPath); } - private static JavaRDD readPathRelationRDD(SparkSession spark, final String inputPath) { + private static JavaRDD readPathRelationRDD( + SparkSession spark, final String inputPath) { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath) - .map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); } private static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index 059cb31f2..5ea267d62 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -1,68 +1,54 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.fasterxml.jackson.databind.ObjectMapper; import 
com.google.common.collect.Maps; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.oa.provision.model.*; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.schema.oaf.*; +import java.util.ArrayList; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. - * The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, - * and all the possible relationships (similarity links produced by the Dedup process are excluded). + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of + * linked objects. 
The operation considers all the entity types (publication, dataset, software, + * ORP, project, datasource, organization, and all the possible relationships (similarity links + * produced by the Dedup process are excluded). * - * The workflow is organized in different parts aimed to to reduce the complexity of the operation - * 1) PrepareRelationsJob: - * only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity - * can be linked at most to 100 other objects + *

The workflow is organized in different parts aimed to reduce the complexity of the + * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted + * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects * - * 2) JoinRelationEntityByTargetJob: - * (phase 1): prepare tuples [relation - target entity] (R - T): - * for each entity type E_i - * map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information - * join (R.target = T_i.id) - * save the tuples (R_i, T_i) - * (phase 2): - * create the union of all the entity types E, hash by id - * read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S - * save the tuples (S, R, T) + *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - + * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting + * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): + * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source + * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) * - * 3) AdjacencyListBuilderJob: - * given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity + *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - + * T ], mapping the result as JoinedEntity * - * 4) XmlConverterJob: - * convert the JoinedEntities as XML records + *

4) XmlConverterJob: convert the JoinedEntities as XML records */ public class XmlConverterJob { @@ -74,16 +60,17 @@ public class XmlConverterJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - XmlConverterJob.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + XmlConverterJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); String inputPath = parser.get("inputPath"); @@ -100,37 +87,71 @@ public class XmlConverterJob { SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - convertToXml(spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); + convertToXml( + spark, + inputPath, + outputPath, + ContextMapper.fromIS(isLookupUrl), + otherDsTypeId); }); - } - private static void convertToXml(SparkSession spark, String inputPath, String outputPath, ContextMapper contextMapper, String otherDsTypeId) { + private static void convertToXml( + SparkSession spark, + String inputPath, + String outputPath, + ContextMapper contextMapper, + String otherDsTypeId) { - final XmlRecordFactory recordFactory = new XmlRecordFactory(prepareAccumulators(spark.sparkContext()), contextMapper, false, schemaLocation, otherDsTypeId); + final XmlRecordFactory recordFactory = + 
new XmlRecordFactory( + prepareAccumulators(spark.sparkContext()), + contextMapper, + false, + schemaLocation, + otherDsTypeId); spark.read() .load(inputPath) .as(Encoders.bean(JoinedEntity.class)) - .map((MapFunction) j -> { - if (j.getLinks() != null) { - j.setLinks(j.getLinks() - .stream() - .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) - .collect(Collectors.toCollection(ArrayList::new))); - } - return j; - }, Encoders.bean(JoinedEntity.class)) - .map((MapFunction>) je -> new Tuple2<>( - je.getEntity().getId(), - recordFactory.build(je) - ), Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .map( + (MapFunction) + j -> { + if (j.getLinks() != null) { + j.setLinks( + j.getLinks().stream() + .filter( + t -> + t.getRelation() != null + & t + .getRelatedEntity() + != null) + .collect( + Collectors.toCollection( + ArrayList::new))); + } + return j; + }, + Encoders.bean(JoinedEntity.class)) + .map( + (MapFunction>) + je -> new Tuple2<>(je.getEntity().getId(), recordFactory.build(je)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) .javaRDD() - .mapToPair((PairFunction, Text, Text>) t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + .mapToPair( + (PairFunction, Text, Text>) + t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) + .saveAsHadoopFile( + outputPath, + Text.class, + Text.class, + SequenceFileOutputFormat.class, + GzipCodec.class); } private static void removeOutputDir(SparkSession spark, String path) { @@ -139,28 +160,62 @@ public class XmlConverterJob { private static Map prepareAccumulators(SparkContext sc) { Map accumulators = Maps.newHashMap(); - accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); - accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", 
sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); - accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo")); - accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy")); - accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); - accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + accumulators.put( + "resultResult_similarity_isAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators.put( + "resultResult_similarity_hasAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators.put( + "resultResult_supplement_isSupplementTo", + sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators.put( + "resultResult_supplement_isSupplementedBy", + sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators.put( + "resultResult_dedup_isMergedIn", + sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put( + "resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); - accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); - accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo")); - accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy")); - accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + accumulators.put( + 
"resultResult_publicationDataset_isRelatedTo", + sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators.put( + "resultResult_relationship_isRelatedTo", + sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators.put( + "resultProject_outcome_isProducedBy", + sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators.put( + "resultProject_outcome_produces", + sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put( + "resultOrganization_affiliation_isAuthorInstitutionOf", + sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); - accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); - accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant")); - accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant")); - accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); - accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); - accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides")); + accumulators.put( + "resultOrganization_affiliation_hasAuthorInstitution", + sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators.put( + "projectOrganization_participation_hasParticipant", + sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators.put( + "projectOrganization_participation_isParticipant", + 
sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators.put( + "organizationOrganization_dedup_isMergedIn", + sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators.put( + "organizationOrganization_dedup_merges", + sc.longAccumulator("resultProject_outcome_produces")); + accumulators.put( + "datasourceOrganization_provision_isProvidedBy", + sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators.put( + "datasourceOrganization_provision_provides", + sc.longAccumulator("datasourceOrganization_provision_provides")); return accumulators; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index 84538c924..2215eaad2 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -1,5 +1,7 @@ package eu.dnetlib.dhp.oa.provision; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + import com.lucidworks.spark.util.SolrSupport; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; @@ -8,6 +10,16 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.Optional; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.stream.StreamResult; +import 
javax.xml.transform.stream.StreamSource; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; @@ -18,19 +30,6 @@ import org.apache.spark.rdd.RDD; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerException; -import javax.xml.transform.stream.StreamResult; -import javax.xml.transform.stream.StreamSource; -import java.io.IOException; -import java.io.StringReader; -import java.io.StringWriter; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - public class XmlIndexingJob { private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); @@ -44,16 +43,17 @@ public class XmlIndexingJob { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils.toString( - XmlIndexingJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + XmlIndexingJob.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + Boolean isSparkSessionManaged = + Optional.ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("inputPath"); @@ -65,7 +65,10 @@ public class XmlIndexingJob { final String format = parser.get("format"); log.info("format: {}", format); - final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? 
Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE; + final Integer batchSize = + parser.getObjectMap().containsKey("batchSize") + ? Integer.valueOf(parser.get("batchSize")) + : DEFAULT_BATCH_SIZE; log.info("batchSize: {}", batchSize); final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); @@ -87,17 +90,30 @@ public class XmlIndexingJob { final SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, + runWithSparkSession( + conf, + isSparkSessionManaged, spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = + JavaSparkContext.fromSparkContext(spark.sparkContext()); - RDD docs = sc.sequenceFile(inputPath, Text.class, Text.class) - .map(t -> t._2().toString()) - .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) - .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) - .rdd(); + RDD docs = + sc.sequenceFile(inputPath, Text.class, Text.class) + .map(t -> t._2().toString()) + .map( + s -> + toIndexRecord( + SaxonTransformerFactory.newInstance( + indexRecordXslt), + s)) + .map( + s -> + new StreamingInputDocumentFactory(version, dsId) + .parseDocument(s)) + .rdd(); - final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; + final String collection = + format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; SolrSupport.indexDocs(zkHost, collection, batchSize, docs); }); } @@ -116,15 +132,16 @@ public class XmlIndexingJob { /** * Creates the XSLT responsible for building the index xml records. 
* - * @param format Metadata format name (DMF|TMF) - * @param xslt xslt for building the index record transformer - * @param fields the list of fields + * @param format Metadata format name (DMF|TMF) + * @param xslt xslt for building the index record transformer + * @param fields the list of fields * @return the javax.xml.transform.Transformer - * @throws ISLookUpException could happen - * @throws IOException could happen + * @throws ISLookUpException could happen + * @throws IOException could happen * @throws TransformerException could happen */ - private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException { + private static String getLayoutTransformer(String format, String fields, String xslt) + throws TransformerException { final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); @@ -136,7 +153,9 @@ public class XmlIndexingJob { } /** - * method return a solr-compatible string representation of a date, used to mark all records as indexed today + * method return a solr-compatible string representation of a date, used to mark all records as + * indexed today + * * @return the parsed date */ public static String getRecordDatestamp() { @@ -144,18 +163,22 @@ public class XmlIndexingJob { } /** - * Method retrieves from the information system the list of fields associated to the given MDFormat name + * Method retrieves from the information system the list of fields associated to the given + * MDFormat name * * @param isLookup the ISLookup service stub * @param format the Metadata format name * @return the string representation of the list of fields to be indexed - * * @throws ISLookUpDocumentNotFoundException * @throws ISLookUpException */ - private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException { - return 
doLookup(isLookup, String.format( - "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT)); + private static String getLayoutSource(final ISLookUpService isLookup, final String format) + throws ISLookUpDocumentNotFoundException, ISLookUpException { + return doLookup( + isLookup, + String.format( + "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", + format, LAYOUT)); } /** @@ -163,42 +186,54 @@ public class XmlIndexingJob { * * @param isLookup the ISLookup service stub * @return the string representation of the XSLT contained in the transformation rule profile - * * @throws ISLookUpDocumentNotFoundException * @throws ISLookUpException */ private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { - return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + - "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); + return doLookup( + isLookup, + "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); } /** - * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name + * Method retrieves from the information system the IndexDS profile ID associated to the given + * MDFormat name + * * @param format * @param isLookup * @return the IndexDS identifier * @throws ISLookUpException */ - private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { - return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + - "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = 
'%s']//RESOURCE_IDENTIFIER/@value/string()", format)); + private static String getDsId(String format, ISLookUpService isLookup) + throws ISLookUpException { + return doLookup( + isLookup, + String.format( + "collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", + format)); } /** * Method retrieves from the information system the zookeeper quorum of the Solr server + * * @param isLookup * @return the zookeeper quorum of the Solr server * @throws ISLookUpException */ private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { - return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); + return doLookup( + isLookup, + "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); } - private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { + private static String doLookup(ISLookUpService isLookup, String xquery) + throws ISLookUpException { log.info(String.format("running xquery: %s", xquery)); final String res = isLookup.getResourceProfileByQuery(xquery); - log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); + log.info( + String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); return res; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index e1ca8e316..5873155cf 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; import com.google.common.base.Objects; - import java.io.Serializable; public class EntityRelEntity implements Serializable { @@ -10,8 +9,7 @@ public class EntityRelEntity implements Serializable { private SortableRelation relation; private RelatedEntity target; - public EntityRelEntity() { - } + public EntityRelEntity() {} public EntityRelEntity(SortableRelation relation, RelatedEntity target) { this(null, relation, target); @@ -52,9 +50,9 @@ public class EntityRelEntity implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; EntityRelEntity that = (EntityRelEntity) o; - return Objects.equal(entity, that.entity) && - Objects.equal(relation, that.relation) && - Objects.equal(target, that.target); + return Objects.equal(entity, that.entity) + && Objects.equal(relation, that.relation) + && Objects.equal(target, that.target); } @Override diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index b6e97a503..519d7f2bd 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -9,8 +9,7 @@ public class JoinedEntity implements Serializable { private List links; - public JoinedEntity() { - } + public JoinedEntity() {} public TypedRow getEntity() { return entity; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 011d9276d..2964c51ee 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -5,7 +5,6 @@ import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; - import java.io.Serializable; import java.util.List; @@ -32,7 +31,7 @@ public class RelatedEntity implements Serializable { private Qualifier datasourcetype; private Qualifier datasourcetypeui; private Qualifier openairecompatibility; - //private String aggregatortype; + // private String aggregatortype; // organization private String legalname; @@ -235,33 +234,56 @@ public class RelatedEntity implements Serializable { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; RelatedEntity that = (RelatedEntity) o; - return Objects.equal(id, that.id) && - Objects.equal(type, that.type) && - Objects.equal(title, that.title) && - Objects.equal(websiteurl, that.websiteurl) && - Objects.equal(dateofacceptance, that.dateofacceptance) && - Objects.equal(publisher, that.publisher) && - Objects.equal(pid, that.pid) && - Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) && - Objects.equal(resulttype, that.resulttype) && - Objects.equal(collectedfrom, that.collectedfrom) && - Objects.equal(instances, that.instances) && - Objects.equal(officialname, that.officialname) && - Objects.equal(datasourcetype, that.datasourcetype) && - Objects.equal(datasourcetypeui, that.datasourcetypeui) && - Objects.equal(openairecompatibility, that.openairecompatibility) && - Objects.equal(legalname, that.legalname) && - Objects.equal(legalshortname, that.legalshortname) && - Objects.equal(country, that.country) && - Objects.equal(projectTitle, that.projectTitle) && - Objects.equal(code, that.code) && - Objects.equal(acronym, that.acronym) && - 
Objects.equal(contracttype, that.contracttype) && - Objects.equal(fundingtree, that.fundingtree); + return Objects.equal(id, that.id) + && Objects.equal(type, that.type) + && Objects.equal(title, that.title) + && Objects.equal(websiteurl, that.websiteurl) + && Objects.equal(dateofacceptance, that.dateofacceptance) + && Objects.equal(publisher, that.publisher) + && Objects.equal(pid, that.pid) + && Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) + && Objects.equal(resulttype, that.resulttype) + && Objects.equal(collectedfrom, that.collectedfrom) + && Objects.equal(instances, that.instances) + && Objects.equal(officialname, that.officialname) + && Objects.equal(datasourcetype, that.datasourcetype) + && Objects.equal(datasourcetypeui, that.datasourcetypeui) + && Objects.equal(openairecompatibility, that.openairecompatibility) + && Objects.equal(legalname, that.legalname) + && Objects.equal(legalshortname, that.legalshortname) + && Objects.equal(country, that.country) + && Objects.equal(projectTitle, that.projectTitle) + && Objects.equal(code, that.code) + && Objects.equal(acronym, that.acronym) + && Objects.equal(contracttype, that.contracttype) + && Objects.equal(fundingtree, that.fundingtree); } @Override public int hashCode() { - return Objects.hashCode(id, type, title, websiteurl, dateofacceptance, publisher, pid, codeRepositoryUrl, resulttype, collectedfrom, instances, officialname, datasourcetype, datasourcetypeui, openairecompatibility, legalname, legalshortname, country, projectTitle, code, acronym, contracttype, fundingtree); + return Objects.hashCode( + id, + type, + title, + websiteurl, + dateofacceptance, + publisher, + pid, + codeRepositoryUrl, + resulttype, + collectedfrom, + instances, + officialname, + datasourcetype, + datasourcetypeui, + openairecompatibility, + legalname, + legalshortname, + country, + projectTitle, + code, + acronym, + contracttype, + fundingtree); } -} \ No newline at end of file +} diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index b294a6633..467bd20d5 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -3,13 +3,12 @@ package eu.dnetlib.dhp.oa.provision.model; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.Relation; - import java.io.Serializable; import java.util.Map; public class SortableRelation extends Relation implements Comparable, Serializable { - private final static Map weights = Maps.newHashMap(); + private static final Map weights = Maps.newHashMap(); static { weights.put("outcome", 0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index 942acaea1..30de29038 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; import eu.dnetlib.dhp.schema.oaf.Relation; - import java.io.Serializable; import java.util.Objects; @@ -11,8 +10,7 @@ public class Tuple2 implements Serializable { private RelatedEntity relatedEntity; - public Tuple2() { - } + public Tuple2() {} public Tuple2(Relation relation, RelatedEntity relatedEntity) { this.relation = relation; @@ -47,5 +45,4 @@ public class Tuple2 implements Serializable { public int hashCode() { return Objects.hash(getRelation().hashCode()); } - } diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java index 54f34802f..01467ab7c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.provision.model; import com.google.common.base.Objects; - import java.io.Serializable; public class TypedRow implements Serializable { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java index fba3a8e7b..f85606b2e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java @@ -48,4 +48,4 @@ public class ContextDef implements Serializable { public void setType(final String type) { this.type = type; } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java index bdeacf45e..fc71d8861 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java @@ -4,22 +4,23 @@ import com.google.common.base.Joiner; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import java.io.Serializable; +import java.io.StringReader; +import 
java.util.HashMap; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; -import java.io.Serializable; -import java.io.StringReader; -import java.util.HashMap; - public class ContextMapper extends HashMap implements Serializable { private static final long serialVersionUID = 2159682308502487305L; - private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; + private static final String XQUERY = + "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; - public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException { + public static ContextMapper fromIS(final String isLookupUrl) + throws DocumentException, ISLookUpException { ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); StringBuilder sb = new StringBuilder(""); Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); @@ -42,5 +43,4 @@ public class ContextMapper extends HashMap implements Serial } return contextMapper; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 42174ac94..cea3539bc 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -1,18 +1,17 @@ package eu.dnetlib.dhp.oa.provision.utils; +import static org.apache.commons.lang3.StringUtils.substringAfter; + import com.google.common.collect.Sets; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import 
eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.oaf.*; - import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import static org.apache.commons.lang3.StringUtils.substringAfter; - public class GraphMappingUtils { public static final String SEPARATOR = "_"; @@ -33,7 +32,6 @@ public class GraphMappingUtils { case dataset: case otherresearchproduct: case software: - Result result = (Result) entity; if (result.getTitle() == null && !result.getTitle().isEmpty()) { @@ -45,8 +43,8 @@ public class GraphMappingUtils { re.setResulttype(result.getResulttype()); re.setInstances(result.getInstance()); - //TODO still to be mapped - //re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + // TODO still to be mapped + // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); break; case datasource: @@ -76,9 +74,8 @@ public class GraphMappingUtils { List> f = p.getFundingtree(); if (!f.isEmpty()) { - re.setFundingtree(f.stream() - .map(s -> s.getValue()) - .collect(Collectors.toList())); + re.setFundingtree( + f.stream().map(s -> s.getValue()).collect(Collectors.toList())); } break; } @@ -104,5 +101,4 @@ public class GraphMappingUtils { public static String getRelDescriptor(String relType, String subRelType, String relClass) { return relType + SEPARATOR + subRelType + SEPARATOR + relClass; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java index 17073038d..9415cbd06 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.provision.utils; import eu.dnetlib.dhp.schema.oaf.Qualifier; - import 
java.util.Comparator; public class LicenseComparator implements Comparator { @@ -45,5 +44,4 @@ public class LicenseComparator implements Comparator { // Else (but unlikely), lexicographical ordering will do. return lClass.compareTo(rClass); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index c8e7a2429..ca0a4ca51 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -5,8 +5,9 @@ import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; /** - * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore - * allowing to sort relations sharing the same source id by the ordering defined in SortableRelationKey. + * Used in combination with SortableRelationKey, allows to partition the records by source id, + * therefore allowing to sort relations sharing the same source id by the ordering defined in + * SortableRelationKey. 
*/ public class RelationPartitioner extends Partitioner { @@ -23,7 +24,7 @@ public class RelationPartitioner extends Partitioner { @Override public int getPartition(Object key) { - return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); + return Utils.nonNegativeMod( + ((SortableRelation) key).getSource().hashCode(), numPartitions()); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java index f0499781f..e86d37fa1 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java @@ -1,5 +1,6 @@ package eu.dnetlib.dhp.oa.provision.utils; +import com.google.common.collect.Lists; import java.io.StringReader; import java.io.StringWriter; import java.util.Arrays; @@ -10,29 +11,23 @@ import javax.xml.stream.*; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; - -import com.google.common.collect.Lists; import org.apache.solr.common.SolrInputDocument; /** * Optimized version of the document parser, drop in replacement of InputDocumentFactory. * - *

- * Faster because: - *

+ *

Faster because: + * *

    - *
  • Doesn't create a DOM for the full document
  • - *
  • Doesn't execute xpaths agains the DOM
  • - *
  • Quickly serialize the 'result' element directly in a string.
  • - *
  • Uses less memory: less pressure on GC and allows more threads to process this in parallel
  • + *
  • Doesn't create a DOM for the full document + *
  • Doesn't execute xpaths against the DOM + *
  • Quickly serialize the 'result' element directly in a string. + *
  • Uses less memory: less pressure on GC and allows more threads to process this in parallel *
* - *

- * This class is fully reentrant and can be invoked in parallel. - *

+ *

This class is fully reentrant and can be invoked in parallel. * * @author claudio - * */ public class StreamingInputDocumentFactory { @@ -50,7 +45,9 @@ public class StreamingInputDocumentFactory { private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); - private final static List dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); + private static final List dateFormats = + Arrays.asList( + "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); private static final String DEFAULTDNETRESULT = "dnetResult"; @@ -62,11 +59,14 @@ public class StreamingInputDocumentFactory { private static final int MAX_FIELD_LENGTH = 25000; - private ThreadLocal inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); + private ThreadLocal inputFactory = + ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); - private ThreadLocal outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); + private ThreadLocal outputFactory = + ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); - private ThreadLocal eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); + private ThreadLocal eventFactory = + ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); private String version; @@ -78,7 +78,8 @@ public class StreamingInputDocumentFactory { this(version, dsId, DEFAULTDNETRESULT); } - public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) { + public StreamingInputDocumentFactory( + final String version, final String dsId, final String resultName) { this.version = version; this.dsId = dsId; this.resultName = resultName; @@ -90,7 +91,8 @@ public class StreamingInputDocumentFactory { final List nsList = Lists.newLinkedList(); try { - XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); + XMLEventReader parser = + 
inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); @@ -150,13 +152,16 @@ public class StreamingInputDocumentFactory { * @param parser * @throws XMLStreamException */ - protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException { + protected void parseTargetFields( + final SolrInputDocument indexDocument, final XMLEventReader parser) + throws XMLStreamException { boolean hasFields = false; while (parser.hasNext()) { final XMLEvent targetEvent = parser.nextEvent(); - if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { + if (targetEvent.isEndElement() + && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { break; } @@ -185,18 +190,21 @@ public class StreamingInputDocumentFactory { * @param nsList * @throws XMLStreamException */ - protected void copyResult(final SolrInputDocument indexDocument, - final StringWriter results, - final XMLEventReader parser, - final List nsList, - final String dnetResult) throws XMLStreamException { + protected void copyResult( + final SolrInputDocument indexDocument, + final StringWriter results, + final XMLEventReader parser, + final List nsList, + final String dnetResult) + throws XMLStreamException { final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); for (Namespace ns : nsList) { eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); } - StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); + StartElement newRecord = + eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); // new root record writer.add(newRecord); @@ -206,7 +214,8 @@ public class StreamingInputDocumentFactory { final XMLEvent resultEvent = parser.nextEvent(); // TODO: replace with depth 
tracking instead of close tag tracking. - if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { + if (resultEvent.isEndElement() + && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { writer.add(eventFactory.get().createEndElement("", null, RESULT)); break; } @@ -224,7 +233,8 @@ public class StreamingInputDocumentFactory { * @param field * @param value */ - private final void addField(final SolrInputDocument indexDocument, final String field, final String value) { + private final void addField( + final SolrInputDocument indexDocument, final String field, final String value) { String cleaned = value.trim(); if (!cleaned.isEmpty()) { // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); @@ -239,7 +249,8 @@ public class StreamingInputDocumentFactory { * @return the */ protected final String getText(final XMLEvent text) { - if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart()); + if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + + // text.asEndElement().getName().getLocalPart()); return ""; final String data = text.asCharacters().getData(); @@ -249,5 +260,4 @@ public class StreamingInputDocumentFactory { return data; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index c9d623a48..efb1dac6c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -1,23 +1,22 @@ package eu.dnetlib.dhp.oa.provision.utils; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; +import static 
eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; + import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.OafEntity; -import org.apache.commons.lang3.StringUtils; -import org.stringtemplate.v4.ST; - import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; -import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; +import org.apache.commons.lang3.StringUtils; +import org.stringtemplate.v4.ST; public class TemplateFactory { private TemplateResources resources; - private final static char DELIMITER = '$'; + private static final char DELIMITER = '$'; public TemplateFactory() { try { @@ -27,7 +26,12 @@ public class TemplateFactory { } } - public String buildBody(final String type, final List metadata, final List rels, final List children, final List extraInfo) { + public String buildBody( + final String type, + final List metadata, + final List rels, + final List children, + final List extraInfo) { ST body = getTemplate(resources.getEntity()); body.add("name", type); @@ -41,61 +45,68 @@ public class TemplateFactory { public String getChild(final String name, final String id, final List metadata) { return getTemplate(resources.getChild()) - .add("name", name) - .add("hasId", !(id == null)) - .add("id", id != null ? escapeXml(removePrefix(id)) : "") - .add("metadata", metadata) - .render(); + .add("name", name) + .add("hasId", !(id == null)) + .add("id", id != null ? 
escapeXml(removePrefix(id)) : "") + .add("metadata", metadata) + .render(); } public String buildRecord( - final OafEntity entity, - final String schemaLocation, - final String body) { + final OafEntity entity, final String schemaLocation, final String body) { return getTemplate(resources.getRecord()) - .add("id", escapeXml(removePrefix(entity.getId()))) - .add("dateofcollection", entity.getDateofcollection()) - .add("dateoftransformation", entity.getDateoftransformation()) - .add("schemaLocation", schemaLocation) - .add("it", body) - .render(); + .add("id", escapeXml(removePrefix(entity.getId()))) + .add("dateofcollection", entity.getDateofcollection()) + .add("dateoftransformation", entity.getDateoftransformation()) + .add("schemaLocation", schemaLocation) + .add("it", body) + .render(); } - public String getRel(final String type, - final String objIdentifier, - final Collection fields, - final String semanticclass, - final String semantischeme, - final DataInfo info) { + public String getRel( + final String type, + final String objIdentifier, + final Collection fields, + final String semanticclass, + final String semantischeme, + final DataInfo info) { return getTemplate(resources.getRel()) - .add("type", type) - .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) - .add("class", semanticclass) - .add("scheme", semantischeme) - .add("metadata", fields) - .add("inferred", info.getInferred()) - .add("trust", info.getTrust()) - .add("inferenceprovenance", info.getInferenceprovenance()) - .add("provenanceaction", info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "") - .render(); + .add("type", type) + .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) + .add("class", semanticclass) + .add("scheme", semantischeme) + .add("metadata", fields) + .add("inferred", info.getInferred()) + .add("trust", info.getTrust()) + .add("inferenceprovenance", info.getInferenceprovenance()) + .add( + "provenanceaction", + info.getProvenanceaction() != null + ? info.getProvenanceaction().getClassid() + : "") + .render(); } - public String getInstance(final String resultId, final List instancemetadata, final List webresources) { + public String getInstance( + final String resultId, + final List instancemetadata, + final List webresources) { return getTemplate(resources.getInstance()) - .add("instanceId", escapeXml(removePrefix(resultId))) - .add("metadata", instancemetadata) - .add("webresources", webresources - .stream() - .filter(StringUtils::isNotBlank) - .map(w -> getWebResource(w)) - .collect(Collectors.toList())) - .render(); + .add("instanceId", escapeXml(removePrefix(resultId))) + .add("metadata", instancemetadata) + .add( + "webresources", + webresources.stream() + .filter(StringUtils::isNotBlank) + .map(w -> getWebResource(w)) + .collect(Collectors.toList())) + .render(); } private String getWebResource(final String identifier) { return getTemplate(resources.getWebresource()) - .add("identifier", escapeXml(identifier)) - .render(); + .add("identifier", escapeXml(identifier)) + .render(); } // HELPERS @@ -103,5 +114,4 @@ public class TemplateFactory { private ST getTemplate(final String res) { return new ST(res, DELIMITER, DELIMITER); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java index b22e083ce..ff7d9e322 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.provision.utils; import com.google.common.io.Resources; - import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -23,9 +22,7 @@ public class TemplateResources { return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); } - public TemplateResources() throws IOException { - - } + public TemplateResources() throws IOException {} public String getEntity() { return entity; @@ -50,5 +47,4 @@ public class TemplateResources { public String getChild() { return child; } - } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index aced9ac0a..3455cb5f3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1,5 +1,9 @@ package eu.dnetlib.dhp.oa.provision.utils; +import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; +import static org.apache.commons.lang3.StringUtils.isNotBlank; +import static org.apache.commons.lang3.StringUtils.substringBefore; + import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Joiner; import com.google.common.base.Splitter; @@ -12,22 +16,8 @@ import eu.dnetlib.dhp.oa.provision.model.*; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.MainEntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.*; -import org.apache.commons.lang3.StringUtils; -import 
org.apache.spark.util.LongAccumulator; -import org.codehaus.janino.Mod; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.Node; -import org.dom4j.io.OutputFormat; -import org.dom4j.io.SAXReader; -import org.dom4j.io.XMLWriter; - -import javax.xml.transform.*; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; +import eu.dnetlib.dhp.schema.oaf.Result; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; @@ -36,14 +26,22 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; -import static org.apache.commons.lang3.StringUtils.isNotBlank; -import static org.apache.commons.lang3.StringUtils.substringBefore; +import javax.xml.transform.*; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.Node; +import org.dom4j.io.OutputFormat; +import org.dom4j.io.SAXReader; +import org.dom4j.io.XMLWriter; public class XmlRecordFactory implements Serializable { - private Map accumulators; + private Map accumulators; private Set specialDatasourceTypes; @@ -56,21 +54,26 @@ public class XmlRecordFactory implements Serializable { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public XmlRecordFactory( - final ContextMapper contextMapper, final boolean indent, - final String schemaLocation, final String otherDatasourceTypesUForUI) { + final ContextMapper contextMapper, + final boolean indent, + final String schemaLocation, + final String otherDatasourceTypesUForUI) { this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); } public 
XmlRecordFactory( - final Map accumulators, - final ContextMapper contextMapper, final boolean indent, - final String schemaLocation, final String otherDatasourceTypesUForUI) { + final Map accumulators, + final ContextMapper contextMapper, + final boolean indent, + final String schemaLocation, + final String otherDatasourceTypesUForUI) { this.accumulators = accumulators; this.contextMapper = contextMapper; this.schemaLocation = schemaLocation; - this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); + this.specialDatasourceTypes = + Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); this.indent = indent; } @@ -85,22 +88,27 @@ public class XmlRecordFactory implements Serializable { final EntityType type = EntityType.valueOf(je.getEntity().getType()); final List metadata = metadata(type, entity, contexts); - // rels has to be processed before the contexts because they enrich the contextMap with the funding info. + // rels has to be processed before the contexts because they enrich the contextMap with + // the + // funding info. 
final List relations = listRelations(je, templateFactory, contexts); final String mainType = ModelSupport.getMainType(type); metadata.addAll(buildContexts(mainType, contexts)); metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); - final String body = templateFactory.buildBody( - mainType, - metadata, - relations, - listChildren(entity, je.getEntity().getType(), templateFactory), listExtraInfo(entity)); + final String body = + templateFactory.buildBody( + mainType, + metadata, + relations, + listChildren(entity, je.getEntity().getType(), templateFactory), + listExtraInfo(entity)); return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); } catch (final Throwable e) { - throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); + throw new RuntimeException( + String.format("error building record '%s'", entity.getId()), e); } } @@ -136,7 +144,8 @@ public class XmlRecordFactory implements Serializable { private String printXML(String xml, boolean indent) { try { final Document doc = new SAXReader().read(new StringReader(xml)); - OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); + OutputFormat format = + indent ? 
OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); format.setExpandEmptyElements(false); format.setSuppressDeclaration(true); StringWriter sw = new StringWriter(); @@ -148,38 +157,36 @@ public class XmlRecordFactory implements Serializable { } } - private List metadata(final EntityType type, final OafEntity entity, final Set contexts) { + private List metadata( + final EntityType type, final OafEntity entity, final Set contexts) { final List metadata = Lists.newArrayList(); - if (entity.getCollectedfrom() != null) { - metadata.addAll(entity.getCollectedfrom() - .stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); + metadata.addAll( + entity.getCollectedfrom().stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); } if (entity.getOriginalId() != null) { - metadata.addAll(entity.getOriginalId() - .stream() - .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) - .collect(Collectors.toList())); + metadata.addAll( + entity.getOriginalId().stream() + .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) + .collect(Collectors.toList())); } if (entity.getPid() != null) { - metadata.addAll(entity.getPid() - .stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); + metadata.addAll( + entity.getPid().stream() + .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); } if (ModelSupport.isResult(type)) { final Result r = (Result) entity; if (r.getContext() != null) { - contexts.addAll(r.getContext() - .stream() - .map(c -> c.getId()) - .collect(Collectors.toList())); + contexts.addAll( + r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList())); /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ if (contexts.contains("dh-ch::subcommunity::2")) { contexts.add("clarin"); @@ -187,114 +194,197 @@ public 
class XmlRecordFactory implements Serializable { } if (r.getTitle() != null) { - metadata.addAll(r.getTitle() - .stream() - .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) - .collect(Collectors.toList())); + metadata.addAll( + r.getTitle().stream() + .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) + .collect(Collectors.toList())); } if (r.getBestaccessright() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", r.getBestaccessright())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "bestaccessright", r.getBestaccessright())); } if (r.getAuthor() != null) { - metadata.addAll(r.getAuthor() - .stream() - .map(a -> { - final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue())) - .forEach(sp -> { - String pidType = XmlSerializationUtils.escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", ""); - String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); - - // ugly hack: some records provide swapped pidtype and pidvalue - if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { - sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); - } else { - pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); - if (isNotBlank(pidType)) { - sb.append(String.format(" %s=\"%s\"", - pidType, - pidValue.toLowerCase().replaceAll("orcid", ""))); + metadata.addAll( + r.getAuthor().stream() + .map( + a -> { + final StringBuilder sb = + new StringBuilder( + "" + XmlSerializationUtils.escapeXml(a.getFullname()) + ""); - return sb.toString(); - }).collect(Collectors.toList())); + if (isNotBlank(a.getSurname())) { + sb.append( + " surname=\"" + + XmlSerializationUtils.escapeXml( + a.getSurname()) + + "\""); + } + if (a.getPid() != null) { + a.getPid().stream() + .filter( + sp -> + isNotBlank( + sp.getQualifier() + .getClassid()) + && isNotBlank( + sp + .getValue())) + .forEach( + sp -> { + String pidType = + XmlSerializationUtils 
+ .escapeXml( + sp.getQualifier() + .getClassid()) + .replaceAll( + "\\W", + ""); + String pidValue = + XmlSerializationUtils + .escapeXml( + sp + .getValue()); + + // ugly hack: some records + // provide swapped pidtype and + // pidvalue + if (authorPidTypes.contains( + pidValue.toLowerCase() + .trim())) { + sb.append( + String.format( + " %s=\"%s\"", + pidValue, + pidType)); + } else { + pidType = + pidType.replaceAll( + "\\W", + "") + .replaceAll( + "\\d", + ""); + if (isNotBlank(pidType)) { + sb.append( + String.format( + " %s=\"%s\"", + pidType, + pidValue.toLowerCase() + .replaceAll( + "orcid", + ""))); + } + } + }); + } + sb.append( + ">" + + XmlSerializationUtils.escapeXml( + a.getFullname()) + + ""); + return sb.toString(); + }) + .collect(Collectors.toList())); } if (r.getContributor() != null) { - metadata.addAll(r.getContributor() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + r.getContributor().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "contributor", c.getValue())) + .collect(Collectors.toList())); } if (r.getCountry() != null) { - metadata.addAll(r.getCountry() - .stream() - .map(c -> XmlSerializationUtils.mapQualifier("country", c)) - .collect(Collectors.toList())); + metadata.addAll( + r.getCountry().stream() + .map(c -> XmlSerializationUtils.mapQualifier("country", c)) + .collect(Collectors.toList())); } if (r.getCoverage() != null) { - metadata.addAll(r.getCoverage() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + r.getCoverage().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "coverage", c.getValue())) + .collect(Collectors.toList())); } if (r.getDateofacceptance() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("dateofacceptance", r.getDateofacceptance().getValue())); + metadata.add( + 
XmlSerializationUtils.asXmlElement( + "dateofacceptance", r.getDateofacceptance().getValue())); } if (r.getDescription() != null) { - metadata.addAll(r.getDescription() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + r.getDescription().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "description", c.getValue())) + .collect(Collectors.toList())); } if (r.getEmbargoenddate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "embargoenddate", r.getEmbargoenddate().getValue())); } if (r.getSubject() != null) { - metadata.addAll(r.getSubject() - .stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) - .collect(Collectors.toList())); + metadata.addAll( + r.getSubject().stream() + .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) + .collect(Collectors.toList())); } if (r.getLanguage() != null) { metadata.add(XmlSerializationUtils.mapQualifier("language", r.getLanguage())); } if (r.getRelevantdate() != null) { - metadata.addAll(r.getRelevantdate() - .stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) - .collect(Collectors.toList())); + metadata.addAll( + r.getRelevantdate().stream() + .map( + s -> + XmlSerializationUtils.mapStructuredProperty( + "relevantdate", s)) + .collect(Collectors.toList())); } if (r.getPublisher() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", r.getPublisher().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "publisher", r.getPublisher().getValue())); } if (r.getSource() != null) { - metadata.addAll(r.getSource() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + r.getSource().stream() + .map( + c -> + 
XmlSerializationUtils.asXmlElement( + "source", c.getValue())) + .collect(Collectors.toList())); } if (r.getFormat() != null) { - metadata.addAll(r.getFormat() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + r.getFormat().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "format", c.getValue())) + .collect(Collectors.toList())); } if (r.getResulttype() != null) { metadata.add(XmlSerializationUtils.mapQualifier("resulttype", r.getResulttype())); } if (r.getResourcetype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); + metadata.add( + XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); } } @@ -311,69 +401,99 @@ public class XmlRecordFactory implements Serializable { case dataset: final Dataset d = (Dataset) entity; if (d.getDevice() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); } if (d.getLastmetadataupdate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "lastmetadataupdate", d.getLastmetadataupdate().getValue())); } if (d.getMetadataversionnumber() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "metadataversionnumber", + d.getMetadataversionnumber().getValue())); } if (d.getSize() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); } if (d.getStoragedate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("storagedate", 
d.getStoragedate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "storagedate", d.getStoragedate().getValue())); } if (d.getVersion() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("version", d.getVersion().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "version", d.getVersion().getValue())); } - //TODO d.getGeolocation() + // TODO d.getGeolocation() break; case otherresearchproduct: final OtherResearchProduct orp = (OtherResearchProduct) entity; if (orp.getContactperson() != null) { - metadata.addAll(orp.getContactperson() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + orp.getContactperson().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "contactperson", c.getValue())) + .collect(Collectors.toList())); } if (orp.getContactgroup() != null) { - metadata.addAll(orp.getContactgroup() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + orp.getContactgroup().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "contactgroup", c.getValue())) + .collect(Collectors.toList())); } if (orp.getTool() != null) { - metadata.addAll(orp.getTool() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + orp.getTool().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "tool", c.getValue())) + .collect(Collectors.toList())); } break; case software: final Software s = (Software) entity; if (s.getDocumentationUrl() != null) { - metadata.addAll(s.getDocumentationUrl() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + s.getDocumentationUrl().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "documentationUrl", 
c.getValue())) + .collect(Collectors.toList())); } if (s.getLicense() != null) { - metadata.addAll(s.getLicense() - .stream() - .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) - .collect(Collectors.toList())); + metadata.addAll( + s.getLicense().stream() + .map( + l -> + XmlSerializationUtils.mapStructuredProperty( + "license", l)) + .collect(Collectors.toList())); } if (s.getCodeRepositoryUrl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); } if (s.getProgrammingLanguage() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("programmingLanguage", s.getProgrammingLanguage())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "programmingLanguage", s.getProgrammingLanguage())); } break; case datasource: @@ -383,121 +503,194 @@ public class XmlRecordFactory implements Serializable { mapDatasourceType(metadata, ds.getDatasourcetype()); } if (ds.getOpenairecompatibility() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("openairecompatibility", ds.getOpenairecompatibility())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "openairecompatibility", ds.getOpenairecompatibility())); } if (ds.getOfficialname() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("officialname", ds.getOfficialname().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "officialname", ds.getOfficialname().getValue())); } if (ds.getEnglishname() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("englishname", ds.getEnglishname().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "englishname", ds.getEnglishname().getValue())); } if (ds.getWebsiteurl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); + metadata.add( + 
XmlSerializationUtils.asXmlElement( + "websiteurl", ds.getWebsiteurl().getValue())); } if (ds.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("logourl", ds.getLogourl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "logourl", ds.getLogourl().getValue())); } if (ds.getContactemail() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("contactemail", ds.getContactemail().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "contactemail", ds.getContactemail().getValue())); } if (ds.getNamespaceprefix() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "namespaceprefix", ds.getNamespaceprefix().getValue())); } if (ds.getLatitude() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("latitude", ds.getLatitude().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "latitude", ds.getLatitude().getValue())); } if (ds.getLongitude() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("longitude", ds.getLongitude().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "longitude", ds.getLongitude().getValue())); } if (ds.getDateofvalidation() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "dateofvalidation", ds.getDateofvalidation().getValue())); } if (ds.getDescription() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "description", ds.getDescription().getValue())); } if (ds.getOdnumberofitems() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "odnumberofitems", 
ds.getOdnumberofitems().getValue())); } if (ds.getOdnumberofitemsdate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); } if (ds.getOdpolicies() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("odpolicies", ds.getOdpolicies().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "odpolicies", ds.getOdpolicies().getValue())); } if (ds.getOdlanguages() != null) { - metadata.addAll(ds.getOdlanguages() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + ds.getOdlanguages().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "odlanguages", c.getValue())) + .collect(Collectors.toList())); } if (ds.getOdcontenttypes() != null) { - metadata.addAll(ds.getOdcontenttypes() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + ds.getOdcontenttypes().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "odcontenttypes", c.getValue())) + .collect(Collectors.toList())); } if (ds.getAccessinfopackage() != null) { - metadata.addAll(ds.getAccessinfopackage() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("accessinfopackage", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + ds.getAccessinfopackage().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "accessinfopackage", c.getValue())) + .collect(Collectors.toList())); } if (ds.getReleaseenddate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("releasestartdate", ds.getReleaseenddate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "releasestartdate", ds.getReleaseenddate().getValue())); } if (ds.getReleaseenddate() != null) 
{ - metadata.add(XmlSerializationUtils.asXmlElement("releaseenddate", ds.getReleaseenddate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "releaseenddate", ds.getReleaseenddate().getValue())); } if (ds.getMissionstatementurl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "missionstatementurl", ds.getMissionstatementurl().getValue())); } if (ds.getDataprovider() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("dataprovider", ds.getDataprovider().getValue().toString())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "dataprovider", ds.getDataprovider().getValue().toString())); } if (ds.getServiceprovider() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "serviceprovider", + ds.getServiceprovider().getValue().toString())); } if (ds.getDatabaseaccesstype() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "databaseaccesstype", ds.getDatabaseaccesstype().getValue())); } if (ds.getDatauploadtype() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("datauploadtype", ds.getDatauploadtype().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "datauploadtype", ds.getDatauploadtype().getValue())); } if (ds.getDatabaseaccessrestriction() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "databaseaccessrestriction", + ds.getDatabaseaccessrestriction().getValue())); } if (ds.getDatauploadrestriction() != null) { - 
metadata.add(XmlSerializationUtils.asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "datauploadrestriction", + ds.getDatauploadrestriction().getValue())); } if (ds.getVersioning() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("versioning", ds.getVersioning().getValue().toString())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "versioning", ds.getVersioning().getValue().toString())); } if (ds.getCitationguidelineurl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "citationguidelineurl", + ds.getCitationguidelineurl().getValue())); } if (ds.getQualitymanagementkind() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "qualitymanagementkind", + ds.getQualitymanagementkind().getValue())); } if (ds.getPidsystems() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "pidsystems", ds.getPidsystems().getValue())); } if (ds.getCertificates() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("certificates", ds.getCertificates().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "certificates", ds.getCertificates().getValue())); } if (ds.getPolicies() != null) { - metadata.addAll(ds.getPolicies() - .stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) - .collect(Collectors.toList())); + metadata.addAll( + ds.getPolicies().stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) + .collect(Collectors.toList())); } if (ds.getJournal() != null) { metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal())); } if 
(ds.getSubjects() != null) { - metadata.addAll(ds.getSubjects() - .stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) - .collect(Collectors.toList())); + metadata.addAll( + ds.getSubjects().stream() + .map( + sp -> + XmlSerializationUtils.mapStructuredProperty( + "subjects", sp)) + .collect(Collectors.toList())); } break; @@ -505,53 +698,87 @@ public class XmlRecordFactory implements Serializable { final Organization o = (Organization) entity; if (o.getLegalshortname() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("legalshortname", o.getLegalshortname().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "legalshortname", o.getLegalshortname().getValue())); } if (o.getLegalname() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("legalname", o.getLegalname().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "legalname", o.getLegalname().getValue())); } if (o.getAlternativeNames() != null) { - metadata.addAll(o.getAlternativeNames() - .stream() - .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) - .collect(Collectors.toList())); + metadata.addAll( + o.getAlternativeNames().stream() + .map( + c -> + XmlSerializationUtils.asXmlElement( + "alternativeNames", c.getValue())) + .collect(Collectors.toList())); } if (o.getWebsiteurl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "websiteurl", o.getWebsiteurl().getValue())); } if (o.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "websiteurl", o.getLogourl().getValue())); } if (o.getEclegalbody() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("eclegalbody", o.getEclegalbody().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + 
"eclegalbody", o.getEclegalbody().getValue())); } if (o.getEclegalperson() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("eclegalperson", o.getEclegalperson().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "eclegalperson", o.getEclegalperson().getValue())); } if (o.getEcnonprofit() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecnonprofit", o.getEcnonprofit().getValue())); } if (o.getEcresearchorganization() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecresearchorganization", + o.getEcresearchorganization().getValue())); } if (o.getEchighereducation() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("echighereducation", o.getEchighereducation().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "echighereducation", o.getEchighereducation().getValue())); } if (o.getEcinternationalorganization() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganization().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecinternationalorganizationeurinterests", + o.getEcinternationalorganization().getValue())); } if (o.getEcinternationalorganization() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecinternationalorganization", + o.getEcinternationalorganization().getValue())); } if (o.getEcenterprise() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecenterprise", o.getEcenterprise().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecenterprise", 
o.getEcenterprise().getValue())); } if (o.getEcsmevalidated() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecsmevalidated", o.getEcsmevalidated().getValue())); } if (o.getEcnutscode() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecnutscode", o.getEcnutscode().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecnutscode", o.getEcnutscode().getValue())); } if (o.getCountry() != null) { metadata.add(XmlSerializationUtils.mapQualifier("country", o.getCountry())); @@ -559,80 +786,119 @@ public class XmlRecordFactory implements Serializable { break; case project: - final Project p = (Project) entity; if (p.getWebsiteurl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", p.getWebsiteurl().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "websiteurl", p.getWebsiteurl().getValue())); } if (p.getCode() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); } if (p.getAcronym() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", p.getAcronym().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "acronym", p.getAcronym().getValue())); } if (p.getTitle() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); } if (p.getStartdate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("startdate", p.getStartdate().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "startdate", p.getStartdate().getValue())); } if (p.getEnddate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("enddate", p.getEnddate().getValue())); + metadata.add( + 
XmlSerializationUtils.asXmlElement( + "enddate", p.getEnddate().getValue())); } if (p.getCallidentifier() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("callidentifier", p.getCallidentifier().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "callidentifier", p.getCallidentifier().getValue())); } if (p.getKeywords() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("keywords", p.getKeywords().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "keywords", p.getKeywords().getValue())); } if (p.getDuration() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("duration", p.getDuration().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "duration", p.getDuration().getValue())); } if (p.getEcarticle29_3() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "ecarticle29_3", p.getEcarticle29_3().getValue())); } if (p.getSubjects() != null) { - metadata.addAll(p.getSubjects() - .stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) - .collect(Collectors.toList())); + metadata.addAll( + p.getSubjects().stream() + .map( + sp -> + XmlSerializationUtils.mapStructuredProperty( + "subject", sp)) + .collect(Collectors.toList())); } if (p.getContracttype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "contracttype", p.getContracttype())); } if (p.getEcsc39() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); } if (p.getContactfullname() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("contactfullname", p.getContactfullname().getValue())); + metadata.add( + 
XmlSerializationUtils.asXmlElement( + "contactfullname", p.getContactfullname().getValue())); } if (p.getContactfax() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "contactfax", p.getContactfax().getValue())); } if (p.getContactphone() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "contactphone", p.getContactphone().getValue())); } if (p.getContactemail() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "contactemail", p.getContactemail().getValue())); } if (p.getSummary() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "summary", p.getSummary().getValue())); } if (p.getCurrency() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("currency", p.getCurrency().getValue())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "currency", p.getCurrency().getValue())); } if (p.getTotalcost() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("totalcost", p.getTotalcost().toString())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "totalcost", p.getTotalcost().toString())); } if (p.getFundedamount() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("fundedamount", p.getFundedamount().toString())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "fundedamount", p.getFundedamount().toString())); } if (p.getFundingtree() != null) { - metadata.addAll(p.getFundingtree() - .stream() - .map(ft -> ft.getValue()) - .collect(Collectors.toList())); + metadata.addAll( + p.getFundingtree().stream() + .map(ft -> ft.getValue()) + .collect(Collectors.toList())); } break; @@ 
-669,7 +935,8 @@ public class XmlRecordFactory implements Serializable { return bestAccessRight; } - private List listRelations(final JoinedEntity je, TemplateFactory templateFactory, final Set contexts) { + private List listRelations( + final JoinedEntity je, TemplateFactory templateFactory, final Set contexts) { final List rels = Lists.newArrayList(); for (final Tuple2 link : je.getLinks()) { @@ -685,104 +952,135 @@ public class XmlRecordFactory implements Serializable { case otherresearchproduct: case software: if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { - metadata.add(XmlSerializationUtils.mapStructuredProperty("title", re.getTitle())); + metadata.add( + XmlSerializationUtils.mapStructuredProperty( + "title", re.getTitle())); } if (isNotBlank(re.getDateofacceptance())) { - metadata.add(XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "dateofacceptance", re.getDateofacceptance())); } if (isNotBlank(re.getPublisher())) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); + metadata.add( + XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); } if (isNotBlank(re.getCodeRepositoryUrl())) { - metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "coderepositoryurl", re.getCodeRepositoryUrl())); } if (re.getResulttype() != null & re.getResulttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "resulttype", re.getResulttype())); } if (re.getCollectedfrom() != null) { - metadata.addAll(re.getCollectedfrom() - .stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); + metadata.addAll( + re.getCollectedfrom().stream() + .map( + kv -> + 
XmlSerializationUtils.mapKeyValue( + "collectedfrom", kv)) + .collect(Collectors.toList())); } if (re.getPid() != null) { - metadata.addAll(re.getPid() - .stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); + metadata.addAll( + re.getPid().stream() + .map( + p -> + XmlSerializationUtils.mapStructuredProperty( + "pid", p)) + .collect(Collectors.toList())); } break; case datasource: if (isNotBlank(re.getOfficialname())) { - metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "officialname", re.getOfficialname())); } if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { mapDatasourceType(metadata, re.getDatasourcetype()); } - if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("openairecompatibility", re.getOpenairecompatibility())); + if (re.getOpenairecompatibility() != null + & !re.getOpenairecompatibility().isBlank()) { + metadata.add( + XmlSerializationUtils.mapQualifier( + "openairecompatibility", re.getOpenairecompatibility())); } break; case organization: if (isNotBlank(re.getLegalname())) { - metadata.add(XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); + metadata.add( + XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); } if (isNotBlank(re.getLegalshortname())) { - metadata.add(XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); + metadata.add( + XmlSerializationUtils.asXmlElement( + "legalshortname", re.getLegalshortname())); } if (re.getCountry() != null & !re.getCountry().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); + metadata.add( + XmlSerializationUtils.mapQualifier("country", re.getCountry())); } break; case project: if (isNotBlank(re.getProjectTitle())) { - 
metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); + metadata.add( + XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); } if (isNotBlank(re.getCode())) { metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode())); } if (isNotBlank(re.getAcronym())) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); + metadata.add( + XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); } if (re.getContracttype() != null & !re.getContracttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); + metadata.add( + XmlSerializationUtils.mapQualifier( + "contracttype", re.getContracttype())); } if (re.getFundingtree() != null) { - metadata.addAll(re.getFundingtree() - .stream() - .peek(ft -> fillContextMap(ft, contexts)) - .map(ft -> getRelFundingTree(ft)) - .collect(Collectors.toList())); + metadata.addAll( + re.getFundingtree().stream() + .peek(ft -> fillContextMap(ft, contexts)) + .map(ft -> getRelFundingTree(ft)) + .collect(Collectors.toList())); } break; default: throw new IllegalArgumentException("invalid target type: " + targetType); - } final DataInfo info = rel.getDataInfo(); final String scheme = ModelSupport.getScheme(re.getType(), targetType); if (StringUtils.isBlank(scheme)) { - throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); + throw new IllegalArgumentException( + String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); } - final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); + final String accumulatorName = + getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); if (accumulators.containsKey(accumulatorName)) { accumulators.get(accumulatorName).add(1); } - rels.add(templateFactory.getRel( - targetType, - rel.getTarget(), - Sets.newHashSet(metadata), - rel.getRelClass(), 
- scheme, - info)); + rels.add( + templateFactory.getRel( + targetType, + rel.getTarget(), + Sets.newHashSet(metadata), + rel.getRelClass(), + scheme, + info)); } return rels; } - private List listChildren(final OafEntity entity, String type, TemplateFactory templateFactory) { + private List listChildren( + final OafEntity entity, String type, TemplateFactory templateFactory) { final List children = Lists.newArrayList(); EntityType entityType = EntityType.valueOf(type); @@ -794,34 +1092,63 @@ public class XmlRecordFactory implements Serializable { final List fields = Lists.newArrayList(); if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { - fields.add(XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); + fields.add( + XmlSerializationUtils.mapQualifier( + "accessright", instance.getAccessright())); } if (instance.getCollectedfrom() != null) { - fields.add(XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); + fields.add( + XmlSerializationUtils.mapKeyValue( + "collectedfrom", instance.getCollectedfrom())); } if (instance.getHostedby() != null) { - fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); + fields.add( + XmlSerializationUtils.mapKeyValue( + "hostedby", instance.getHostedby())); } - if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) { - fields.add(XmlSerializationUtils.asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue())); + if (instance.getDateofacceptance() != null + && isNotBlank(instance.getDateofacceptance().getValue())) { + fields.add( + XmlSerializationUtils.asXmlElement( + "dateofacceptance", + instance.getDateofacceptance().getValue())); } - if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { - fields.add(XmlSerializationUtils.mapQualifier("instancetype", instance.getInstancetype())); + if (instance.getInstancetype() != null + && 
!instance.getInstancetype().isBlank()) { + fields.add( + XmlSerializationUtils.mapQualifier( + "instancetype", instance.getInstancetype())); } if (isNotBlank(instance.getDistributionlocation())) { - fields.add(XmlSerializationUtils.asXmlElement("distributionlocation", instance.getDistributionlocation())); + fields.add( + XmlSerializationUtils.asXmlElement( + "distributionlocation", + instance.getDistributionlocation())); } - if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { - fields.add(XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); + if (instance.getRefereed() != null + && isNotBlank(instance.getRefereed().getValue())) { + fields.add( + XmlSerializationUtils.asXmlElement( + "refereed", instance.getRefereed().getValue())); } - if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) { - fields.add(XmlSerializationUtils.asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue())); + if (instance.getProcessingchargeamount() != null + && isNotBlank(instance.getProcessingchargeamount().getValue())) { + fields.add( + XmlSerializationUtils.asXmlElement( + "processingchargeamount", + instance.getProcessingchargeamount().getValue())); } - if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) { - fields.add(XmlSerializationUtils.asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue())); + if (instance.getProcessingchargecurrency() != null + && isNotBlank(instance.getProcessingchargecurrency().getValue())) { + fields.add( + XmlSerializationUtils.asXmlElement( + "processingchargecurrency", + instance.getProcessingchargecurrency().getValue())); } - children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl())); + children.add( + templateFactory.getInstance( + 
instance.getHostedby().getKey(), fields, instance.getUrl())); } } final List ext = ((Result) entity).getExternalReference(); @@ -831,7 +1158,8 @@ public class XmlRecordFactory implements Serializable { final List fields = Lists.newArrayList(); if (isNotBlank(er.getSitename())) { - fields.add(XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); + fields.add( + XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); } if (isNotBlank(er.getLabel())) { fields.add(XmlSerializationUtils.asXmlElement("label", er.getLabel())); @@ -840,13 +1168,18 @@ public class XmlRecordFactory implements Serializable { fields.add(XmlSerializationUtils.asXmlElement("url", er.getUrl())); } if (isNotBlank(er.getDescription())) { - fields.add(XmlSerializationUtils.asXmlElement("description", er.getDescription())); + fields.add( + XmlSerializationUtils.asXmlElement( + "description", er.getDescription())); } if (isNotBlank(er.getUrl())) { - fields.add(XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); + fields.add( + XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); } if (isNotBlank(er.getRefidentifier())) { - fields.add(XmlSerializationUtils.asXmlElement("refidentifier", er.getRefidentifier())); + fields.add( + XmlSerializationUtils.asXmlElement( + "refidentifier", er.getRefidentifier())); } if (isNotBlank(er.getQuery())) { fields.add(XmlSerializationUtils.asXmlElement("query", er.getQuery())); @@ -862,16 +1195,19 @@ public class XmlRecordFactory implements Serializable { private List listExtraInfo(OafEntity entity) { final List extraInfo = entity.getExtraInfo(); - return extraInfo != null ? extraInfo - .stream() - .map(e -> XmlSerializationUtils.mapExtraInfo(e)) - .collect(Collectors.toList()) : Lists.newArrayList(); + return extraInfo != null + ? 
extraInfo.stream() + .map(e -> XmlSerializationUtils.mapExtraInfo(e)) + .collect(Collectors.toList()) + : Lists.newArrayList(); } private List buildContexts(final String type, final Set contexts) { final List res = Lists.newArrayList(); - if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) { + if ((contextMapper != null) + && !contextMapper.isEmpty() + && MainEntityType.result.toString().equals(type)) { XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); @@ -885,7 +1221,9 @@ public class XmlRecordFactory implements Serializable { if (def == null) { continue; - // throw new IllegalStateException(String.format("cannot find context for id '%s'", id)); + // throw new IllegalStateException(String.format("cannot find context for id + // '%s'", + // id)); } if (def.getName().equals("context")) { @@ -897,7 +1235,13 @@ public class XmlRecordFactory implements Serializable { if (def.getName().equals("category")) { final String rootId = substringBefore(def.getId(), "::"); - document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def); + document = + addContextDef( + document.gotoRoot() + .gotoTag( + "//context[./@id='" + rootId + "']", + new Object()), + def); } if (def.getName().equals("concept")) { @@ -930,14 +1274,17 @@ public class XmlRecordFactory implements Serializable { } private XMLTag addContextDef(final XMLTag tag, final ContextDef def) { - tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); + tag.addTag(def.getName()) + .addAttribute("id", def.getId()) + .addAttribute("label", def.getLabel()); if ((def.getType() != null) && !def.getType().isEmpty()) { tag.addAttribute("type", def.getType()); } return tag; } - private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException { + private String asStringElement(final org.w3c.dom.Element element, 
final Transformer transformer) + throws TransformerException { final StringWriter buffer = new StringWriter(); transformer.transform(new DOMSource(element), new StreamResult(buffer)); return buffer.toString(); @@ -959,23 +1306,41 @@ public class XmlRecordFactory implements Serializable { final String funderShortName = funder.valueOf("./shortname"); contexts.add(funderShortName); - contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); + contextMapper.put( + funderShortName, + new ContextDef( + funderShortName, funder.valueOf("./name"), "context", "funding")); final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); if (level0 != null) { - final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); - contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); + final String level0Id = + Joiner.on("::").join(funderShortName, level0.valueOf("./name")); + contextMapper.put( + level0Id, + new ContextDef( + level0Id, level0.valueOf("./description"), "category", "")); final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); if (level1 == null) { contexts.add(level0Id); } else { - final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); - contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); + final String level1Id = + Joiner.on("::").join(level0Id, level1.valueOf("./name")); + contextMapper.put( + level1Id, + new ContextDef( + level1Id, level1.valueOf("./description"), "concept", "")); final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); if (level2 == null) { contexts.add(level1Id); } else { - final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); - contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); + final String level2Id = + 
Joiner.on("::").join(level1Id, level2.valueOf("./name")); + contextMapper.put( + level2Id, + new ContextDef( + level2Id, + level2.valueOf("./description"), + "concept", + "")); contexts.add(level2Id); } } @@ -986,8 +1351,6 @@ public class XmlRecordFactory implements Serializable { } } - - @SuppressWarnings("unchecked") protected static String getRelFundingTree(final String xmlTree) { String funding = ""; @@ -997,13 +1360,26 @@ public class XmlRecordFactory implements Serializable { funding += getFunderElement(ftree); - for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { + for (final Object o : + Lists.reverse( + ftree.selectNodes( + "//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { final Element e = (Element) o; final String _id = e.valueOf("./id"); - funding += "<" + e.getName() + " name=\"" + XmlSerializationUtils.escapeXml(e.valueOf("./name")) + "\">" + XmlSerializationUtils.escapeXml(_id) + ""; + funding += + "<" + + e.getName() + + " name=\"" + + XmlSerializationUtils.escapeXml(e.valueOf("./name")) + + "\">" + + XmlSerializationUtils.escapeXml(_id) + + ""; } } catch (final DocumentException e) { - throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); + throw new IllegalArgumentException( + "unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); } finally { funding += ""; } @@ -1016,8 +1392,14 @@ public class XmlRecordFactory implements Serializable { final String funderName = ftree.valueOf("//fundingtree/funder/name"); final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction"); - return ""; + return ""; } - -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index 
bc183d0b3..4dacce7b9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -1,41 +1,43 @@ package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.schema.oaf.*; - import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isNotBlank; +import eu.dnetlib.dhp.schema.oaf.*; + public class XmlSerializationUtils { // XML 1.0 // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - private final static String xml10pattern = "[^" - + "\u0009\r\n" - + "\u0020-\uD7FF" - + "\uE000-\uFFFD" - + "\ud800\udc00-\udbff\udfff" - + "]"; + private static final String xml10pattern = + "[^" + + "\u0009\r\n" + + "\u0020-\uD7FF" + + "\uE000-\uFFFD" + + "\ud800\udc00-\udbff\udfff" + + "]"; public static String mapJournal(Journal j) { - final String attrs = new StringBuilder() - .append(attr("issn", j.getIssnPrinted())) - .append(attr("eissn", j.getIssnOnline())) - .append(attr("lissn", j.getIssnLinking())) - .append(attr("ep", j.getEp())) - .append(attr("iss", j.getIss())) - .append(attr("sp", j.getSp())) - .append(attr("vol", j.getVol())) - .toString() - .trim(); + final String attrs = + new StringBuilder() + .append(attr("issn", j.getIssnPrinted())) + .append(attr("eissn", j.getIssnOnline())) + .append(attr("lissn", j.getIssnLinking())) + .append(attr("ep", j.getEp())) + .append(attr("iss", j.getIss())) + .append(attr("sp", j.getSp())) + .append(attr("vol", j.getVol())) + .toString() + .trim(); return new StringBuilder() - .append("") - .append(escapeXml(j.getName())) - .append("") - .toString(); + .append("") + .append(escapeXml(j.getName())) + .append("") + .toString(); } private static String attr(final String name, final String value) { @@ 
-43,7 +45,11 @@ public class XmlSerializationUtils { } public static String mapStructuredProperty(String name, StructuredProperty t) { - return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null); + return asXmlElement( + name, + t.getValue(), + t.getQualifier(), + t.getDataInfo() != null ? t.getDataInfo() : null); } public static String mapQualifier(String name, Qualifier q) { @@ -51,8 +57,7 @@ public class XmlSerializationUtils { } public static String escapeXml(final String value) { - return value - .replaceAll("&", "&") + return value.replaceAll("&", "&") .replaceAll("<", "<") .replaceAll(">", ">") .replaceAll("\"", """) @@ -67,16 +72,25 @@ public class XmlSerializationUtils { .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) .append(asXmlElement("trust", dataInfo.getTrust() + "")) .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) - .append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null)) + .append( + asXmlElement( + "provenanceaction", null, dataInfo.getProvenanceaction(), null)) .append("") .toString(); } private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { - return sb - .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + return sb.append( + attr( + "inferred", + info.getInferred() != null ? info.getInferred().toString() : "")) .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null + ? 
info.getProvenanceaction().getClassid() + : "")) .append(attr("trust", info.getTrust())); } @@ -108,7 +122,8 @@ public class XmlSerializationUtils { return asXmlElement(name, value, null, null); } - public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) { + public static String asXmlElement( + final String name, final String value, final Qualifier q, final DataInfo info) { StringBuilder sb = new StringBuilder(); sb.append("<"); sb.append(name); @@ -116,12 +131,21 @@ public class XmlSerializationUtils { sb.append(getAttributes(q)); } if (info != null) { - sb - .append(" ") - .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) - .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")) - .append(attr("trust", info.getTrust())); + sb.append(" ") + .append( + attr( + "inferred", + info.getInferred() != null + ? info.getInferred().toString() + : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null + ? 
info.getProvenanceaction().getClassid() + : "")) + .append(attr("trust", info.getTrust())); } if (isBlank(value)) { sb.append("/>"); @@ -147,5 +171,4 @@ public class XmlSerializationUtils { .append(attr("schemename", q.getSchemename())) .toString(); } - } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java index d1456d832..e1c0af7ad 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java @@ -1,10 +1,9 @@ package eu.dnetlib.dhp.oa.provision; -import org.junit.jupiter.api.BeforeEach; - import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import org.junit.jupiter.api.BeforeEach; public class GraphJoinerTest { @@ -21,18 +20,22 @@ public class GraphJoinerTest { } private static void copyFiles(Path source, Path target) throws IOException { - Files.list(source).forEach(f -> { - try { - if (Files.isDirectory(f)) { - Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); - copyFiles(f, subTarget); - } else { - Files.copy(f, target.resolve(f.getFileName())); - } - } catch (IOException e) { - e.printStackTrace(); - throw new RuntimeException(e); - } - }); + Files.list(source) + .forEach( + f -> { + try { + if (Files.isDirectory(f)) { + Path subTarget = + Files.createDirectories( + target.resolve(f.getFileName())); + copyFiles(f, subTarget); + } else { + Files.copy(f, target.resolve(f.getFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); } } diff --git a/pom.xml b/pom.xml index 1d36d42f9..f4dfc2c0d 100644 --- a/pom.xml +++ b/pom.xml @@ -462,6 +462,36 @@ + + com.cosium.code + git-code-format-maven-plugin + ${git-code-format-maven-plugin.version} + + + + 
install-formatter-hook + + install-hooks + + + + + validate-code-format + + validate-code-format + + + + + + true + false + false + false + + + org.apache.maven.plugins maven-release-plugin @@ -535,6 +565,7 @@ UTF-8 3.6.0 2.22.2 + 2.4 cdh5.9.2 2.6.0-${dhp.cdh.version} 4.1.0-${dhp.cdh.version}